In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [38]:
data = pd.read_csv(r'twitter_training.csv', header=None)
data.columns = ['#', 'topic', 'sentiment', 'text']
data.dropna(inplace = True, axis = 0)
data.duplicated().sum()
data.drop_duplicates(inplace=True)

In [40]:
le = preprocessing.LabelEncoder()
le.fit(["Negative", "Positive", "Neutral", "Irrelevant"])
le.transform(["Positive", "Neutral", "Negative", "Irrelevant"])

array([3, 2, 1, 0], dtype=int64)

In [41]:
y = data['sentiment']
X = data['text']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3, stratify=y)

In [43]:
X_train

3318             God, this makes my job even harder thanks
67760              I can't wait for this game to come out.
54176    @ CfDuty @ Afghanistan Whether you can remove ...
8350     L i felt like throwing up da whole time while ...
45958    Percell James Wright Little Virginia Verizon B...
                               ...                        
8387     ever since i was called funny ive never be abl...
55703    This is a Call<unk> Duty Zombies fan account p...
65390                      Y'all respect the white so much
59432    Wanna Know How You Can Tell When The Truth is ...
2642     Call of Duty Black Ops Cold War aka How to com...
Name: text, Length: 57324, dtype: object

In [44]:
y_test.value_counts() / len(y_test)

Negative      0.302819
Positive      0.275119
Neutral       0.247139
Irrelevant    0.174923
Name: sentiment, dtype: float64

In [45]:
y_train.value_counts() / len(y_train)

Negative      0.302805
Positive      0.275103
Neutral       0.247122
Irrelevant    0.174970
Name: sentiment, dtype: float64

In [46]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

In [47]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(count_train, le.transform(y_train))

In [48]:
predict = log_reg.predict(count_test)

In [49]:
predict

array([1, 2, 1, ..., 0, 1, 2])

In [50]:
print('Accuracy score is', accuracy_score(le.transform(y_test), predict))

Accuracy score is 0.8449623220764723


In [51]:
# Compating the result with the training data
metrics.confusion_matrix(le.transform(y_test), predict, labels=[0,1,2,3]) 

array([[1963,  166,  108,  270],
       [  86, 3787,  149,  318],
       [  88,  211, 2873,  370],
       [  86,  217,  153, 3487]], dtype=int64)

In [52]:
result = pd.DataFrame(metrics.confusion_matrix(le.transform(y_test), predict), columns = ["Negative", "Positive", "Neutral", "Irrelevant"], index= ["Negative", "Positive", "Neutral", "Irrelevant"])

In [53]:
result

Unnamed: 0,Negative,Positive,Neutral,Irrelevant
Negative,1963,166,108,270
Positive,86,3787,149,318
Neutral,88,211,2873,370
Irrelevant,86,217,153,3487


In [54]:
# Embedding with bigrams
min_n = 2 
max_n = 2

In [55]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(min_n, max_n))
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

In [56]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2').fit(count_train, le.transform(y_train))

In [57]:
predict = log_reg.predict(count_test)

In [58]:
print('Accuracy score is', accuracy_score(le.transform(y_test), predict))

Accuracy score is 0.9037817471392687


In [59]:
metrics.confusion_matrix(le.transform(y_test), predict, labels=[0,1,2,3])

array([[2121,   48,   36,  302],
       [   8, 3891,   35,  406],
       [   6,   58, 3112,  366],
       [  11,   47,   56, 3829]], dtype=int64)

In [60]:
result

Unnamed: 0,Negative,Positive,Neutral,Irrelevant
Negative,1963,166,108,270
Positive,86,3787,149,318
Neutral,88,211,2873,370
Irrelevant,86,217,153,3487
