In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from time import time
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import models,layers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../input/sentiment140/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1",header = None)
df.columns = ['labels','time','date','qurey','username','tweet']
df.head(10)

Unnamed: 0,labels,time,date,qurey,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [3]:
df.describe()

Unnamed: 0,labels,time
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   labels    1600000 non-null  int64 
 1   time      1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   qurey     1600000 non-null  object
 4   username  1600000 non-null  object
 5   tweet     1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [5]:
df.labels.value_counts()

0    800000
4    800000
Name: labels, dtype: int64

> df_pos is positive tweets and positive tweets labels is 4
> df_neg is negative tweets and negative tweets labels is 0

In [6]:
df_pos = df[df['labels'] == 4]
df_neg = df[df['labels'] == 0]
print(len(df_pos), len(df_neg))

800000 800000


# Preprocessing Dataset

# 1. Tokenization

In [7]:
data1=df[df['labels']==0].sample(2500)
data2=df[df['labels']==4].sample(2500)

In [8]:
df=data1.append(data2)

In [9]:
print(df.shape)

(5000, 6)


In [10]:
text = df.tweet.to_list()
token = Tokenizer()
token.fit_on_texts(text)

In [11]:
len(token.index_word)

11588

In [12]:
vocab_size = len(token.word_index)+1
vocab_size

11589

In [13]:
encoded_text=token.texts_to_sequences(text)

In [14]:
max_length=200
x=sequence.pad_sequences(encoded_text,maxlen=max_length,padding='pre')
x

array([[    0,     0,     0, ...,   104,   453,   844],
       [    0,     0,     0, ...,   343,    25,   114],
       [    0,     0,     0, ...,   211,   221,  1417],
       ...,
       [    0,     0,     0, ...,   959, 11583, 11584],
       [    0,     0,     0, ...,  1189,    46,    34],
       [    0,     0,     0, ...,    12,     5,  1477]], dtype=int32)

In [15]:
x.shape

(5000, 200)

In [16]:
y=df['labels']
y.head()

644295    0
468304    0
669586    0
248227    0
664507    0
Name: labels, dtype: int64

In [17]:
y.value_counts()

0    2500
4    2500
Name: labels, dtype: int64

In [18]:
y=y.map({0:0,4:1})
y.shape

(5000,)

In [19]:
y.value_counts()

0    2500
1    2500
Name: labels, dtype: int64

# ****Model Training****

# ANN

In [39]:
vec_size=500
nn=models.Sequential([
    
    layers.Embedding(vocab_size,vec_size,input_length=max_length),
    layers.Conv1D(64,8,activation='relu'),
    layers.MaxPooling1D(2),
    layers.Dropout(0.8),
    
    layers.Dense(40,activation='relu'),
    layers.Dropout(0.9),
    
    layers.Dense(29,activation='relu'),
    layers.GlobalMaxPooling1D(),
    
    layers.Dense(1,activation='sigmoid')
    
    
    ])

In [21]:
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 500)          5794500   
_________________________________________________________________
conv1d (Conv1D)              (None, 193, 64)           256064    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 96, 64)            0         
_________________________________________________________________
dropout (Dropout)            (None, 96, 64)            0         
_________________________________________________________________
dense (Dense)                (None, 96, 40)            2600      
_________________________________________________________________
dropout_1 (Dropout)          (None, 96, 40)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 96, 29)            1

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30)
nn.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
nn.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=25)

Epoch 1/25


2022-11-14 08:01:19.514905: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-11-14 08:01:21.758185: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f2bf154f150>

In [23]:
nn.evaluate(x_train,y_train)



[0.08559820055961609, 0.9800000190734863]

In [24]:
nn.evaluate(x_test,y_test)



[0.8210312724113464, 0.6313333511352539]

In [25]:
x_test=np.array(x_test)
x_test

array([[    0,     0,     0, ..., 10745,    34,    50],
       [    0,     0,     0, ...,  7331,     9,   108],
       [    0,     0,     0, ...,    43,    26,   254],
       ...,
       [    0,     0,     0, ...,  2603,     9,    24],
       [    0,     0,     0, ...,   130,   150,    89],
       [    0,     0,     0, ...,  4481,    87,    42]], dtype=int32)

In [26]:
y_train

1539043    1
1256874    1
1157767    1
1234301    1
190275     0
          ..
949826     1
1316133    1
1151206    1
1367784    1
1380763    1
Name: labels, Length: 3500, dtype: int64

In [27]:
preds=nn.predict(x_test)
preds=(preds>0.5)
preds[:5]

array([[ True],
       [ True],
       [ True],
       [ True],
       [ True]])

In [28]:
def get_encoded(x):
    x=token.texts_to_sequences(x)
    x=sequence.pad_sequences(x,maxlen=max_length,padding='pre')
    return x

In [29]:
h=['he is a cute boy who loves to help others']
h = get_encoded(h)
nn.predict(h)

array([[0.95499855]], dtype=float32)

In [30]:
h = ['I hate you']
h = get_encoded(h)
nn.predict(h)

array([[0.19985792]], dtype=float32)

# RNN

In [31]:
model = keras.Sequential()
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256)
model.add(layers.GRU(256, return_sequences=True))

# The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128)
model.add(layers.SimpleRNN(128))

model.add(layers.Dense(10))

model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          64000     
_________________________________________________________________
gru (GRU)                    (None, None, 256)         247296    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               49280     
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
Total params: 361,866
Trainable params: 361,866
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f2bea0a6510>

In [None]:
model.evaluate(x_train,y_train)

# Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l2",random_state=42,C=0.01)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Training Accuracy",clf.score(x_train, y_train))
print("Testing Accuracy",clf.score(x_test,y_pred))

Training Accuracy 0.504
Testing Accuracy 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
