<a href="https://colab.research.google.com/github/rickwag/ML/blob/main/TwitsSentimentAnalysis(SVM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sentiment Analysis
### Dataset => Twits

In [13]:
#data acquisation
data_url = "/content/sample_data/twitter_training.csv"

import pandas as pd

df = pd.read_csv(data_url, engine = "python")

In [14]:
df.head(5)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [15]:
df.shape

(24003, 4)

### Data Cleaning

In [16]:
#drop empty rows if any
df.dropna(inplace = True)

In [17]:
twits = df.iloc[:, -1]

In [18]:
#lowercasing
df.iloc[:, -1] = [str(twit).lower() for twit in twits]

In [19]:
df.head(5)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


### Preprocessing

In [20]:
#tokenization
import nltk
nltk.download("punkt")

from nltk.tokenize import word_tokenize

df.iloc[:, -1] = [word_tokenize(twit) for twit in twits]

df.head(5)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  return asarray(a).ndim


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,"[i, am, coming, to, the, borders, and, i, will..."
1,2401,Borderlands,Positive,"[im, getting, on, borderlands, and, i, will, k..."
2,2401,Borderlands,Positive,"[im, coming, on, borderlands, and, i, will, mu..."
3,2401,Borderlands,Positive,"[im, getting, on, borderlands, 2, and, i, will..."
4,2401,Borderlands,Positive,"[im, getting, into, borderlands, and, i, can, ..."


In [21]:
#stopwords removal and lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
nltk.download("stopwords")

for index, twit in enumerate(df.iloc[:, -1]):
  new_doc = []
  for token in twit:
    if token not in stopwords.words("english"): #if it's not a stopword
      lemma = lemmatizer.lemmatize(token)
      new_doc.append(lemma)
  
  df.loc[index, "final_twits"] = str(new_doc)

df.head(5)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",final_twits
0,2401.0,Borderlands,Positive,"[i, am, coming, to, the, borders, and, i, will...","['coming', 'border', 'kill', ',']"
1,2401.0,Borderlands,Positive,"[im, getting, on, borderlands, and, i, will, k...","['im', 'getting', 'borderland', 'kill', ',']"
2,2401.0,Borderlands,Positive,"[im, coming, on, borderlands, and, i, will, mu...","['im', 'coming', 'borderland', 'murder', ',']"
3,2401.0,Borderlands,Positive,"[im, getting, on, borderlands, 2, and, i, will...","['im', 'getting', 'borderland', '2', 'murder',..."
4,2401.0,Borderlands,Positive,"[im, getting, into, borderlands, and, i, can, ...","['im', 'getting', 'borderland', 'murder', ',']"


In [None]:
df.dropna(subset=["final_twits"], inplace = True)

In [22]:
#drop empty rows 
df.dropna(subset=["Positive"], inplace = True)

df["Positive"].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [23]:
#drop the irrelevant rows
df_irrelevant = df[df["Positive"] == "Irrelevant"]
df_irrelevant.shape

df.drop(df.index[df["Positive"] == "Irrelevant"], inplace=True) 
df["Positive"].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [52]:
#splitting data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["final_twits"], df["Positive"], test_size=.2)

### Encoding (Labels)

In [50]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

### Text Representation (Vectorization)

In [53]:
#TfIdf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(df["final_twits"])

X_train = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

### Modelling



In [86]:
#create model
#from sklearn.linear_model import LogisticRegression
from sklearn import svm

#model = LogisticRegression(max_iter=80)
model = svm.SVC(kernel="linear", gamma="auto")

In [87]:
#train model
model.fit(X_train, y_train)

SVC(gamma='auto', kernel='linear')

### Model evaluation

In [88]:
#accuracy
model.score(X_test_vec, y_test)

0.7009079118028534

In [89]:
y_predictions = model.predict(X_test_vec)

In [90]:
#f1 score
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    Negative       0.72      0.66      0.69      1216
     Neutral       0.73      0.64      0.68      1198
    Positive       0.67      0.78      0.72      1441

    accuracy                           0.70      3855
   macro avg       0.71      0.70      0.70      3855
weighted avg       0.70      0.70      0.70      3855

