In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')

In [2]:
# Load the tweet dataset from a CSV file located relative to the notebook's
# working directory.
df = pd.read_csv('clean_tweets.csv')

In [3]:
# Drop the leftover CSV index column. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Recode the raw labels 0 / 4 to 0 / 1. The extra 1 -> 1 entry makes the
# mapping idempotent, so re-running the cell does not turn already-recoded
# positives into NaN. Any other value still maps to NaN, as before.
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1, 1: 1})

In [4]:
# Inspect dtypes and per-column non-null counts; a non-null count below the
# row count means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [6]:
# Re-inspect the frame to confirm that no null entries remain after dropna.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1596714 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1596714 non-null int64
text         1596714 non-null object
dtypes: int64(1), object(1)
memory usage: 36.5+ MB


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Features are the tweet text only; the target is the binary sentiment label.
x, y = df['text'], df['sentiment']

# Step 1: hold out 2% of the rows as a combined validation + test pool.
(x_train, x_validation_and_test,
 y_train, y_validation_and_test) = train_test_split(
    x, y, test_size=0.02, random_state=42)

# Step 2: split that pool in half into the validation set and the test set.
(x_validation, x_test,
 y_validation, y_test) = train_test_split(
    x_validation_and_test, y_validation_and_test,
    test_size=0.5, random_state=42)

In [9]:
# Report the size and class balance of each split. The three message templates
# are kept verbatim; only the repeated percentage arithmetic is shared.
for template, split_x, split_y in (
    ("Training set has {0} entries, where {1:.2f}% are positive and {2:.2f}% are negative",
     x_train, y_train),
    ("Validation set has {0} entries, where {1:.2f}% are positive and {2:.2f}% are negative",
     x_validation, y_validation),
    ("Testing set has {0} entries, where {1:.2f}% are positive and {2:.2f}% are negative",
     x_test, y_test),
):
    n_rows = len(split_x)
    positive_pct = len(split_x[split_y == 1]) / n_rows * 100
    negative_pct = len(split_x[split_y == 0]) / n_rows * 100
    print(template.format(n_rows, positive_pct, negative_pct))

Training set has 1564779 entries, where 49.99% are positive and 50.01% are negative
Validation set has 15967 entries, where 49.82% are positive and 50.18% are negative
Testing set has 15968 entries, where 50.33% are positive and 49.67% are negative


In [Fifth.ipynb](https://github.com/rkritika1508/Sentiment-Analysis/blob/master/Fifth.ipynb), we saw that a TfidfVectorizer limited to 90,000 features, using unigrams and bigrams, gave the highest validation accuracy (82.45%) with Logistic Regression.

## ANN with TF-IDF Vectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
%%time
# TF-IDF over unigrams and bigrams, keeping at most 90000 features.
tvec = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=90000,
)
# Learn the vocabulary and IDF weights from the training text only.
tvec.fit(x_train)

Wall time: 1min 51s


In [12]:
# Vectorise both splits with the vectorizer that was fitted on the training text.
x_train_tfidf = tvec.transform(x_train)
# The validation matrix is converted to a dense array here; it is later passed
# whole as Keras validation_data and to clf.score.
# NOTE(review): a dense 15967 x 90000 array is roughly 11.5 GB if the dtype is
# float64 - confirm the dtype and that the machine has the memory for it.
x_validation_tfidf = tvec.transform(x_validation).toarray()

In [14]:
import warnings
# Install a global 'ignore' filter, silencing all warnings from here on.
# NOTE(review): this also hides warnings raised by later cells (e.g. solver
# convergence or deprecation notices) - consider narrowing it to the specific
# categories that were noisy.
warnings.filterwarnings('ignore')

In [15]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline on the sparse TF-IDF training matrix, used as a
# reference point for the neural network trained further down.
clf = LogisticRegression(solver='lbfgs')
clf.fit(x_train_tfidf, y_train)

Wall time: 51.9 s


In [16]:
# Mean accuracy of the baseline on the data it was trained on.
clf.score(x_train_tfidf, y_train)

0.8378691176198044

In [17]:
# Mean accuracy of the baseline on the held-out validation split (dense array).
clf.score(x_validation_tfidf, y_validation)

0.8249514623911818

In [18]:
# Fix NumPy's global random seed for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras/TensorFlow backend presumably
# keeps its own random state that is not seeded here - confirm whether runs
# are actually repeatable.
np.random.seed(7)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


In [19]:
# (n_samples, n_features) of the sparse training matrix; the second value must
# match the network's input_dim.
x_train_tfidf.shape

(1564779, 90000)

In [20]:
# Number of training rows, i.e. the first entry of the matrix shape.
np.shape(x_train_tfidf)[0]

1564779

In [21]:
# Shape of the label vector; its length should equal the number of rows in
# x_train_tfidf.
y_train.shape

(1564779,)

In [22]:
def batch_gen(X_data, Y_data, batch_size):
    """Endlessly yield dense ``(X_batch, Y_batch)`` mini-batches in row order.

    Rows are served sequentially (no shuffling). After the last batch of a
    pass the generator wraps around to the first row, so it never raises
    ``StopIteration`` and is meant to be driven by a step count such as
    Keras' ``steps_per_epoch``.

    Parameters
    ----------
    X_data : scipy sparse matrix or array-like, shape (n_samples, n_features)
        Feature matrix supporting row slicing. Sparse slices are densified one
        batch at a time via ``.toarray()``, so the full matrix is never held
        in dense form.
    Y_data : pandas.Series or array-like, shape (n_samples,)
        Targets aligned row-for-row with ``X_data``. A Series is sliced by
        position, so its index labels do not need to be unique or sorted.
    batch_size : int
        Maximum number of rows per batch; must be positive. The last batch of
        a pass is smaller when ``n_samples`` is not a multiple of it.

    Yields
    ------
    tuple
        ``(X_batch, Y_batch)`` where ``X_batch`` is a dense ``numpy.ndarray``
        and ``Y_batch`` is the matching slice of ``Y_data``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, the data is empty, or ``X_data``
        and ``Y_data`` have different numbers of rows. Because this is a
        generator, the error surfaces on the first ``next()`` call.
    """
    n_samples = X_data.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
    if n_samples == 0:
        raise ValueError("cannot generate batches from empty data")
    if Y_data.shape[0] != n_samples:
        raise ValueError("X_data has {} rows but Y_data has {}".format(n_samples, Y_data.shape[0]))

    # Ceiling division. The previous `counter > n_samples / batch_size` check
    # allowed one extra iteration whenever n_samples was an exact multiple of
    # batch_size, which yielded an empty batch at the end of every pass.
    n_batches = -(-n_samples // batch_size)

    batch_no = 0
    while True:
        start = batch_no * batch_size
        stop = min(start + batch_size, n_samples)

        X_batch = X_data[start:stop]
        # Densify sparse slices; pass dense input through as an ndarray.
        if hasattr(X_batch, 'toarray'):
            X_batch = X_batch.toarray()
        else:
            X_batch = np.asarray(X_batch)

        # Positional slicing: a label-based lookup would return extra rows if
        # the Series index contained duplicate labels.
        if hasattr(Y_data, 'iloc'):
            Y_batch = Y_data.iloc[start:stop]
        else:
            Y_batch = Y_data[start:stop]

        yield X_batch, Y_batch
        batch_no = (batch_no + 1) % n_batches

In [23]:
%%time
# Mini-batch size shared by the generator and the step count below.
batch_size = 32
# fit_generator documents steps_per_epoch as an integer. Round up so the final
# partial batch is included and each epoch covers every training row once.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

# One hidden layer of 64 ReLU units feeding a sigmoid output for the binary
# sentiment label. The input width is read from the data instead of being
# hard-coded, so it stays correct if the vectorizer's feature count changes.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=x_train_tfidf.shape[1]))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train from the sparse matrix one densified batch at a time; validation uses
# the pre-densified validation array.
model.fit_generator(generator=batch_gen(x_train_tfidf, y_train, batch_size),
                    epochs=5,
                    validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=steps_per_epoch)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 11h 18min 44s


In [23]:
%%time
from sklearn.preprocessing import Normalizer
# Rescale each sample (row) with Normalizer's default settings, applying the
# same transformer to the training and validation matrices.
# NOTE(review): TfidfVectorizer's default norm is 'l2' and so is Normalizer's,
# so this step is presumably a no-op on the TF-IDF rows - confirm before
# spending another multi-hour training run on the "normalised" data.
norm = Normalizer().fit(x_train_tfidf)
x_train_tfidf_norm = norm.transform(x_train_tfidf)
x_validation_tfidf_norm = norm.transform(x_validation_tfidf)

Wall time: 4min 37s


In [None]:
%%time
# Mini-batch size shared by the generator and the step count below.
batch_size = 32
# fit_generator documents steps_per_epoch as an integer. Round up so the final
# partial batch is included and each epoch covers every training row once.
steps_per_epoch = int(np.ceil(x_train_tfidf_norm.shape[0] / batch_size))

# Same architecture as the un-normalised run (64 ReLU units -> sigmoid), this
# time trained on the Normalizer output. The input width is read from the data
# instead of being hard-coded.
model_n = Sequential()
model_n.add(Dense(64, activation='relu', input_dim=x_train_tfidf_norm.shape[1]))
model_n.add(Dense(1, activation='sigmoid'))

model_n.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train from the sparse matrix one densified batch at a time; validation uses
# the normalised dense validation array.
model_n.fit_generator(generator=batch_gen(x_train_tfidf_norm, y_train, batch_size),
                      epochs=5,
                      validation_data=(x_validation_tfidf_norm, y_validation),
                      steps_per_epoch=steps_per_epoch)