In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence
import random
# Reproducibility knobs: SEED is handed to train_test_split as random_state,
# np_seed initialises NumPy's global random number generator.
np_seed = 7
SEED = 2000
np.random.seed(seed=np_seed)

In [None]:
# Google Colab only: mount Google Drive and read the cleaned tweets from it.

from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv("/content/drive/MyDrive/clean_tweet.csv", index_col=0)
# First and last 10,000 rows, selected by position.
data1 = data.iloc[:10000]
data2 = data.iloc[-10000:]

In [3]:
# Local run: read the cleaned tweets (first CSV column becomes the index) and preview them.
data = pd.read_csv(filepath_or_buffer="../data/clean_tweet.csv", index_col=0)
data.head(n=5)

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [4]:
# Features are the tweet text, labels are the 0/1 sentiment target.
x, y = data["text"], data["target"]

In [7]:
# Hold out 2% of the tweets, then cut that hold-out in half:
# one half for validation, the other half for the final test set.
x_train, x_validation_test, y_train, y_validation_test = train_test_split(
    x, y, test_size=0.02, random_state=SEED
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_test,
    y_validation_test,
    test_size=0.5,
    random_state=SEED,
)

In [8]:
def report_class_balance(name, texts, labels):
    """Print the size of one data split and its class balance.

    Parameters
    ----------
    name : str
        Label used at the start of the printed line (e.g. "Train").
    texts : pandas.Series
        The samples of the split.
    labels : pandas.Series
        The 0/1 targets aligned with ``texts`` (0 = negative, 1 = positive).
    """
    total = len(texts)
    negative_pct = len(texts[labels == 0]) * 100 / (total * 1.0)
    positive_pct = len(texts[labels == 1]) * 100 / (total * 1.0)
    print(f"{name} set has total {total} with {negative_pct}% negative and {positive_pct}% positive")


# One report line per split; the wording is the same for all three.
for split_name, split_x, split_y in (
    ("Train", x_train, y_train),
    ("Validation", x_validation, y_validation),
    ("Test", x_test, y_test),
):
    report_class_balance(split_name, split_x, split_y)

Train set has total 1564120 with 50.020139119760636% negative and 49.979860880239364% positive
Validation set has total 15960 with 49.454887218045116% negative and 50.545112781954884% positive
Test set has total 15961 with 49.67733851262452% negative and 50.32266148737548% positive


In [10]:
# TF-IDF over 1- to 3-grams with max_features=100000; the vocabulary is
# learned from the training tweets only.
tvec = TfidfVectorizer(ngram_range=(1, 3), max_features=100000)
tvec.fit(raw_documents=x_train)

In [11]:
# Vectorize the training tweets; the result of transform is kept as returned (not densified).
x_train_tfidf = tvec.transform(raw_documents=x_train)

In [12]:
# Vectorize the validation tweets, then densify: this array is later passed to
# clf.score and used as the Keras validation_data.
x_validation_sparse = tvec.transform(x_validation)
x_validation_tfidf = x_validation_sparse.toarray()

In [16]:
%%time
# Baseline: logistic regression with default settings on the TF-IDF training matrix.
clf = LogisticRegression()
clf.fit(X=x_train_tfidf, y=y_train)

CPU times: user 2min 13s, sys: 9.25 s, total: 2min 22s
Wall time: 32.7 s


In [17]:
# Baseline score on the held-out validation split.
clf.score(X=x_validation_tfidf, y=y_validation)

0.8218671679197995

In [18]:
# Baseline score on the training split, for comparison with the validation score.
clf.score(X=x_train_tfidf, y=y_train)

0.8322820499705905

In [67]:
# Sanity check: how many validation labels there are.
y_validation.shape

(15960,)

In [104]:
def batch_generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` mini-batches forever, cycling over the data.

    Rows are served in their stored order (no shuffling). Only full batches are
    produced: the trailing ``n_samples % batch_size`` rows are never yielded,
    except when ``batch_size`` exceeds the number of samples, in which case
    every batch is the whole data set.

    Parameters
    ----------
    X_data : scipy.sparse matrix or 2-D numpy.ndarray
        Feature matrix. Sparse input is densified one batch at a time with
        ``.toarray()`` so the full matrix never has to be held dense in memory.
    y_data : pandas.Series, numpy.ndarray or sequence
        Labels aligned with the rows of ``X_data`` by position.
    batch_size : int
        Number of rows per batch.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Dense features and the matching labels for one batch.

    Raises
    ------
    ValueError
        If ``X_data`` and ``y_data`` do not have the same number of rows.
    """
    samples_per_epoch = X_data.shape[0]

    # Labels are matched to rows by POSITION. Looking them up through the
    # Series index labels is wrong when the index has duplicates (e.g. a subset
    # drawn with replacement): each duplicated label returns every matching
    # row, so y_batch would come out longer than X_batch.
    labels = np.asarray(y_data)
    if labels.shape[0] != samples_per_epoch:
        raise ValueError(
            f"X_data has {samples_per_epoch} rows but y_data has {labels.shape[0]} labels"
        )

    # At least one batch per cycle, so a batch_size larger than the data set
    # keeps yielding the whole data set instead of stalling.
    number_of_batches = max(samples_per_epoch // batch_size, 1)

    while True:
        for batch_number in range(number_of_batches):
            start = batch_size * batch_number
            stop = start + batch_size
            X_batch = X_data[start:stop, :]
            if hasattr(X_batch, "toarray"):
                # Sparse slice -> dense array for this batch only.
                X_batch = X_batch.toarray()
            yield X_batch, labels[start:stop]

In [105]:
# Small feed-forward classifier: one 64-unit ReLU hidden layer followed by a
# single sigmoid unit, trained with binary cross-entropy.
# The input width is read from the fitted TF-IDF matrix instead of being
# hard-coded, so the model stays consistent with the vectorizer if the learned
# vocabulary is smaller than, or configured differently from, max_features.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=x_train_tfidf.shape[1]))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [106]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 64)                6400064   
                                                                 
 dense_29 (Dense)            (None, 1)                 65        
                                                                 
Total params: 6400129 (24.41 MB)
Trainable params: 6400129 (24.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [107]:
# Inspect the symbolic output tensor of the model.
model.output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense_29')>

In [108]:
# Train the network on a small random sample of the training set.
fraction = 0.001
number_to_choose = int(fraction * x_train_tfidf.shape[0])
# Draw DISTINCT row positions. Sampling with replacement (np.random.randint)
# can pick the same tweet more than once, which over-weights it and leaves
# duplicate index labels in y_train_subset.
random_numbers = np.random.choice(x_train_tfidf.shape[0], size=number_to_choose, replace=False)

# Select the same positions from the feature matrix and from the labels.
x_train_subset = x_train_tfidf[random_numbers, :]
y_train_subset = y_train.iloc[random_numbers]

In [109]:
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated and no longer exists in newer Keras releases.
# Because the generator never terminates, steps_per_epoch tells Keras how many
# batches make up one epoch.
BATCH_SIZE = 32
model.fit(batch_generator(x_train_subset, y_train_subset, BATCH_SIZE),
          epochs=5, validation_data=(x_validation_tfidf, y_validation),
          steps_per_epoch=x_train_subset.shape[0] // BATCH_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7feda4668cd0>