In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

In [2]:
# Load the tweets dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('clean_tweets.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV --
# TODO confirm). errors='ignore' keeps this cell re-runnable: a second run no
# longer raises KeyError on the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Recode the labels to 0 = negative, 1 = positive (raw file: 0 and 4).
# replace() leaves values that are already recoded untouched, so re-running
# the cell is a no-op; map({0: 0, 4: 1}) would turn every 1 into NaN on a
# second run.
df['sentiment'] = df['sentiment'].replace({4: 1})

# Fail loudly if the file contains a label other than the two expected ones.
assert df['sentiment'].isin([0, 1]).all(), "unexpected sentiment label in clean_tweets.csv"

In [4]:
# Summarise dtypes and non-null counts; a column whose non-null count is below
# the number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
# Discard every row that has a missing value in any column. The index is not
# reset, so the surviving rows keep their original labels.
df = df.dropna()

In [6]:
# Re-check the frame after dropping rows with missing values: every column
# should now report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


## Train / Dev / Test Split

We will split the data into three sets: train, development and test. Our chosen ratio is 98/1/1, i.e. 98% for the training set, 1% for the development set and 1% for the test set.
* Train set: The dataset used for learning
* Development Set: A sample of data held back from training and used to estimate model skill while tuning the model's hyperparameters. The code below refers to it as the validation set.
* Test Set: The dataset used to assess the performance of a model.


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Feature: the 'text' column only. Target: the 'sentiment' label.
x, y = df['text'], df['sentiment']

# Stage 1: keep 98% of the rows for training and hold out the remaining 2%.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=0.02, random_state=42
)

# Stage 2: cut the hold-out in half, giving a validation set and a test set of
# roughly 1% of the data each. The fixed random_state makes both splits
# reproducible.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=0.5, random_state=42
)

In [9]:
# Report the size and class balance of each split. One loop replaces three
# copy-pasted print statements, so the message and the arithmetic live in a
# single place; the printed text is unchanged.
for split_name, texts, labels in (
    ("Training", x_train, y_train),
    ("Validation", x_validation, y_validation),
    ("Testing", x_test, y_test),
):
    n_entries = len(texts)
    # Percentage of entries whose label is 1 (positive) / 0 (negative).
    pct_positive = len(texts[labels == 1]) / n_entries * 100
    pct_negative = len(texts[labels == 0]) / n_entries * 100
    print("{0} set has {1} entries, where {2:.2f} are positive and {3:.2f} are negative".
          format(split_name, n_entries, pct_positive, pct_negative))

Training set has 1564779 entries, where 49.99 are positive and 50.01 are negative
Validation set has 15967 entries, where 49.82 are positive and 50.18 are negative
Testing set has 15968 entries, where 50.33 are positive and 49.67 are negative


## Baseline

### TextBlob

We will use TextBlob's sentiment analysis function as a baseline for our project. It gives us a point of reference against which to compare our future models.

In [10]:
from textblob import TextBlob

In [11]:
# TextBlob polarity score for every tweet in the validation set, in order.
tbresult = [TextBlob(tweet).sentiment.polarity for tweet in x_validation]

In [12]:
# Turn each polarity score into a class label: below zero -> 0 (negative),
# otherwise -> 1 (positive). A polarity of exactly 0 is therefore labelled
# positive.
tbpred = [0 if polarity < 0 else 1 for polarity in tbresult]

In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [19]:
# Rows are the true class and columns the predicted class, both ordered
# positive (1) first, then negative (0).
class_order = [1, 0]
conmat = np.array(confusion_matrix(y_validation, tbpred, labels=class_order))
confusion = pd.DataFrame(
    data=conmat,
    index=['positive', 'negative'],
    columns=['predicted_positive', 'predicted_negative'],
)

# Share of validation tweets whose thresholded TextBlob label matches the truth.
baseline_accuracy_pct = accuracy_score(y_validation, tbpred) * 100
print("Accuracy score: {0:.2f} %".format(baseline_accuracy_pct))

Accuracy score: 61.41 %


In [20]:
# Show the labelled confusion matrix under its heading.
print("Confusion Matrix", confusion, sep="\n")

Confusion Matrix
          predicted_positive  predicted_negative
positive                7136                 818
negative                5344                2669


In [21]:
# Per-class precision, recall and F1 of the TextBlob baseline on the validation set.
print("Classification Report", classification_report(y_validation, tbpred), sep="\n")

Classification Report
              precision    recall  f1-score   support

           0       0.77      0.33      0.46      8013
           1       0.57      0.90      0.70      7954

   micro avg       0.61      0.61      0.61     15967
   macro avg       0.67      0.62      0.58     15967
weighted avg       0.67      0.61      0.58     15967



Hence, TextBlob sentiment analysis yielded 61.41% accuracy on the validation set.