In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence
import random
# Reproducibility settings: SEED is passed to train_test_split as random_state;
# np_seed seeds NumPy's global RNG (used when the batch generators shuffle).
SEED, np_seed = 2000, 7
np.random.seed(np_seed)

In [None]:
# Google Colab only: mount Drive, then keep the first and last 10,000 rows of
# the cleaned tweet file as a smaller working set.
from google.colab import drive

drive.mount('/content/drive')

data = pd.read_csv("/content/drive/MyDrive/clean_tweet.csv", index_col=0)
data1, data2 = data.head(10000), data.tail(10000)
data = pd.concat([data1, data2])

In [16]:
# Local run: read the cleaned tweets and keep only the first and last 100,000
# rows (presumably the file is ordered by target, so this keeps both classes
# balanced; confirm against the CSV).
data = pd.read_csv("../data/clean_tweet.csv", index_col=0)
data1, data2 = data.head(100000), data.tail(100000)
data = pd.concat([data1, data2])
data.head()

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [17]:
# Features are the tweet text; labels are the `target` column.
x, y = data["text"], data["target"]

In [18]:
# Hold out 10% of the rows, then split that hold-out evenly into validation
# and test sets. Both splits reuse SEED so the partition is reproducible.
x_train, x_validation_test, y_train, y_validation_test = train_test_split(
    x, y, test_size=0.1, random_state=SEED
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_test, y_validation_test, test_size=0.5, random_state=SEED
)

In [19]:
# Report the size and class balance (target 0 = negative, 1 = positive) of
# each split with a single loop instead of three copy-pasted prints.
for _name, _texts, _labels in (
    ("Train", x_train, y_train),
    ("Validation", x_validation, y_validation),
    ("Test", x_test, y_test),
):
    _total = len(_texts)
    _negative_pct = len(_texts[_labels == 0]) * 100 / (_total * 1.0)
    _positive_pct = len(_texts[_labels == 1]) * 100 / (_total * 1.0)
    print(f"{_name} set has total {_total} with {_negative_pct}% negative and {_positive_pct}% positive")

Train set has total 180000 with 49.97611111111111% negative and 50.02388888888889% positive
Validation set has total 10000 with 50.22% negative and 49.78% positive
Test set has total 10000 with 50.21% negative and 49.79% positive


In [20]:
# TF-IDF over unigrams, bigrams and trigrams, capped at the 100,000 most
# frequent terms. The vocabulary is learned from the training split only, so
# validation/test text does not leak into the features.
# NOTE(review): the networks below hard-code input_dim=100000, which assumes
# the fitted vocabulary actually reaches the max_features cap; confirm.
tvec = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
tvec.fit(x_train)

In [21]:
# Training features stay in sparse form; the batch generators densify one
# mini-batch at a time with .toarray().
x_train_tfidf = tvec.transform(x_train)

In [22]:
# The validation features are converted to a dense array because they are
# passed whole as validation_data to the Keras models below.
# NOTE(review): with 10,000 validation rows and up to 100,000 features this is
# a very large dense matrix (several GB at the vectorizer's default dtype);
# consider a smaller dtype or a validation generator if memory is tight.
x_validation_tfidf = tvec.transform(x_validation).toarray()

In [23]:
%%time
# Baseline: logistic regression with default hyperparameters, trained on the
# sparse TF-IDF training matrix. %%time reports how long the fit takes.
clf = LogisticRegression()
clf.fit(x_train_tfidf, y_train)

CPU times: user 23.1 s, sys: 2.66 s, total: 25.8 s
Wall time: 4.12 s


In [24]:
# Accuracy of the logistic-regression baseline on the validation split.
clf.score(x_validation_tfidf, y_validation)

0.8117

In [25]:
# Accuracy on the training split, for comparison with the validation score
# (the gap between the two indicates how much the baseline overfits).
clf.score(x_train_tfidf, y_train)

0.8613666666666666

In [26]:
# Sanity check: number of validation labels.
y_validation.shape

(10000,)

In [27]:
def batch_generator(X_data, y_data, batch_size):
    """Yield dense ``(X_batch, y_batch)`` mini-batches forever, in row order.

    The feature matrix is kept sparse; only the rows of the current batch are
    densified with ``.toarray()``, so the full dense matrix is never built.

    Parameters
    ----------
    X_data : scipy sparse matrix
        Feature matrix of shape ``(n_samples, n_features)``; must support row
        slicing and ``.toarray()``.
    y_data : pandas.Series or array-like
        Labels in the same row order as ``X_data``. Labels are selected by
        position, so the Series index (including duplicated or non-default
        labels) is irrelevant.
    batch_size : int
        Number of rows per batch.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Dense features of shape ``(batch_size, n_features)`` and the matching
        ``batch_size`` labels.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is smaller than 1 or larger than
        the number of rows (no full batch could ever be produced).

    Notes
    -----
    Only full batches are produced: the trailing
    ``n_samples % batch_size`` rows are never yielded. After the last full
    batch the generator wraps around to the first row.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch // batch_size
    if number_of_batches < 1:
        raise ValueError(
            "batch_size is larger than the number of samples; "
            "no full batch can be produced"
        )

    # Convert once, outside the loop, so labels are taken by position rather
    # than by index label.
    labels = np.asarray(y_data)

    counter = 0
    while True:
        start = batch_size * counter
        stop = start + batch_size
        # Contiguous row slice of the sparse matrix, densified per batch.
        yield X_data[start:stop].toarray(), labels[start:stop]
        # Wrap around after the last full batch.
        counter = (counter + 1) % number_of_batches

In [28]:
# Baseline network: one 64-unit ReLU hidden layer over the 100,000 TF-IDF
# features, followed by a single sigmoid unit for binary classification.
model = Sequential([
    Dense(64, activation='relu', input_dim=100000),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Print the layer structure and parameter counts of the baseline network.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                6400064   
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6400129 (24.41 MB)
Trainable params: 6400129 (24.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# Inspect the model's symbolic output tensor (one value per sample).
model.output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense_1')>

In [31]:
# Train the baseline network on dense mini-batches of 32 rows streamed from the
# sparse TF-IDF matrix. `Model.fit` accepts Python generators directly;
# `fit_generator` is deprecated and is not available in Keras 3.
# steps_per_epoch tells Keras how many batches make up one pass over the
# training data, since the generator itself never terminates.
model.fit(batch_generator(x_train_tfidf, y_train, 32),
          epochs=5, validation_data=(x_validation_tfidf, y_validation),
          steps_per_epoch=x_train_tfidf.shape[0]//32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fe6864d9280>

### With dropout layer

In [32]:
# Same architecture as the baseline, with a Dropout layer (rate 0.2) after the
# hidden layer as regularisation.
model_dropout = Sequential()
model_dropout.add(Dense(64, activation='relu', input_dim=100000))
model_dropout.add(Dropout(0.2))
model_dropout.add(Dense(1, activation='sigmoid'))
model_dropout.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# `Model.fit` accepts generators directly (`fit_generator` is deprecated and
# not available in Keras 3). steps_per_epoch uses integer division so it is an
# int, consistent with the other training cells.
model_dropout.fit(batch_generator(x_train_tfidf, y_train, 32),
                  epochs=5, validation_data=(x_validation_tfidf, y_validation),
                  steps_per_epoch=x_train_tfidf.shape[0]//32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fe6a063ea90>

### With data shuffling

In [35]:
def batch_generator_shuffle(X_data, y_data, batch_size):
    """Yield dense ``(X_batch, y_batch)`` mini-batches forever, in random order.

    Row order is shuffled with NumPy's global RNG before the first batch and
    again after every complete pass, so each pass visits the rows in a new
    order. Only the rows of the current batch are densified.

    Parameters
    ----------
    X_data : scipy sparse matrix
        Feature matrix of shape ``(n_samples, n_features)``; must support
        row indexing with an integer array and ``.toarray()``.
    y_data : pandas.Series or array-like
        Labels in the same row order as ``X_data``. Labels are selected by
        position, so the Series index is irrelevant.
    batch_size : int
        Number of rows per batch.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Dense features of shape ``(batch_size, n_features)`` and the matching
        ``batch_size`` labels.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is smaller than 1 or larger than
        the number of rows.

    Notes
    -----
    Each pass consists of exactly ``n_samples // batch_size`` full batches,
    matching the ``steps_per_epoch`` used by the training cells. Rows left
    over in one pass may be picked up in the next because of the reshuffle.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch // batch_size
    if number_of_batches < 1:
        raise ValueError(
            "batch_size is larger than the number of samples; "
            "no full batch can be produced"
        )

    # Convert once so labels are taken by position, not by index label.
    labels = np.asarray(y_data)
    index = np.arange(labels.shape[0])
    np.random.shuffle(index)

    counter = 0
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        yield X_data[index_batch, :].toarray(), labels[index_batch]
        counter += 1
        # `>=` (not `>`): restart right after the last full batch so no empty
        # or partial batch is emitted between passes.
        if counter >= number_of_batches:
            np.random.shuffle(index)
            counter = 0
            
# Baseline architecture again, this time trained on batches whose row order is
# reshuffled on every pass over the training data.
model_s = Sequential()
model_s.add(Dense(64, activation='relu', input_dim=100000))
model_s.add(Dense(1, activation='sigmoid'))
model_s.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# `Model.fit` accepts generators directly; `fit_generator` is deprecated and
# not available in Keras 3.
model_s.fit(batch_generator_shuffle(x_train_tfidf, y_train, 32),
            epochs=5, validation_data=(x_validation_tfidf, y_validation),
            steps_per_epoch=x_train_tfidf.shape[0]//32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fe68bb30460>

### Increasing number of nodes in hidden layer

In [36]:
# Wider variant: 128 hidden units instead of 64, trained on shuffled batches
# for 3 epochs.
model_s_more_nodes = Sequential()
model_s_more_nodes.add(Dense(128, activation='relu', input_dim=100000))
model_s_more_nodes.add(Dense(1, activation='sigmoid'))
model_s_more_nodes.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# `Model.fit` accepts generators directly; `fit_generator` is deprecated and
# not available in Keras 3.
model_s_more_nodes.fit(batch_generator_shuffle(x_train_tfidf, y_train, 32),
                       epochs=3, validation_data=(x_validation_tfidf, y_validation),
                       steps_per_epoch=x_train_tfidf.shape[0]//32)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7fe6a0e9a040>

#### Next, we could try Word2Vec and Doc2Vec embeddings as features that capture semantics when training a neural network.

#### Other options include CNNs and other well-known neural network architectures.