# Preprocessing of data and preparation of our GloVe vectorized datasets.


## Import useful libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import string
import re
import nltk
import pickle

from gensim.utils import simple_preprocess
from gensim import corpora

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import recall_score,f1_score,precision_score,classification_report,roc_curve,auc
from sklearn.model_selection import cross_val_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from google.colab import drive
# Mount Google Drive so the '/content/drive/My Drive/...' paths used later in this notebook resolve.
drive.mount('/content/drive/')
# Fetch the WordNet corpus for NLTK (the lemmatization step below uses WordNetLemmatizer).
nltk.download('wordnet')

Mounted at /content/drive/
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Retrieve and clean our dataset.

### Retrieve and display our dataset!

In [None]:
# Read at most the first 500,000 rows of the tweets CSV from the mounted Drive.
data = pd.read_csv(
    r'/content/drive/My Drive/AI(II)/datasets/SentimentTweets.csv',
    nrows=500000,
)

# Keep the tweet text (X) and the label (Y) as separate single-column DataFrames.
X, Y = (
    pd.DataFrame(data[column], columns=[column])
    for column in ('text', 'target')
)

display(X, Y)

Unnamed: 0,text
0,#brokenpromises...
1,David Carradine so sad. Thai's law not sure i...
2,A @ 415 B @ 425. Tell your bro i say congrats!
3,@littlefluffycat Indeed.
4,Completed Race 4 Life in 58mins with girlies f...
...,...
499995,@SkyNews they've finally managed to kill F1!
499996,@elvensapphire I'm brazilian so brazilian are ...
499997,@ScherziNicole Aww thats so sweet! I joined a ...
499998,Woke up kinda depressed and sad


Unnamed: 0,target
0,0
1,0
2,4
3,4
4,4
...,...
499995,0
499996,0
499997,4
499998,0


### Convert to lowercase.

In [None]:
# Lowercase every tweet so that casing no longer distinguishes otherwise equal words.
lowercased_text = X['text'].str.lower()
X['text'] = lowercased_text
display(X)

Unnamed: 0,text
0,#brokenpromises...
1,david carradine so sad. thai's law not sure i...
2,a @ 415 b @ 425. tell your bro i say congrats!
3,@littlefluffycat indeed.
4,completed race 4 life in 58mins with girlies f...
...,...
499995,@skynews they've finally managed to kill f1!
499996,@elvensapphire i'm brazilian so brazilian are ...
499997,@scherzinicole aww thats so sweet! i joined a ...
499998,woke up kinda depressed and sad


### Replace target value 4 with 1 (binary form).

In [None]:
# Map label 4 to 1 so the target column only holds the values 0 and 1.
Y = Y.replace(to_replace=4, value=1)
display(Y)

# Show how many rows fall into each class.
class_counts = Y.value_counts()
print(class_counts)

Unnamed: 0,target
0,0
1,0
2,1
3,1
4,1
...,...
499995,0
499996,0
499997,1
499998,0


target
1         250800
0         249200
dtype: int64


### Removal of special characters and punctuation.

In [None]:
def remove_special_characters(text, remove_digits=True):
  """Strip every character that is not an ASCII letter, a digit or whitespace.

  Args:
    text: The string to clean.
    remove_digits: Accepted for backward compatibility only; it currently has
      no effect and digits are always kept.

  Returns:
    A copy of `text` with all other characters removed.
  """
  # The upper-case range must be 'A-Z'. The previous 'A-z' also spanned the
  # ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `), so they were not removed.
  pattern = r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

# First pass: delete every ASCII punctuation character using a translation table.
punctuation_table = str.maketrans('', '', string.punctuation)
X['text'] = X['text'].apply(lambda tweet: tweet.translate(punctuation_table))

# Second pass: strip whatever other special characters are still present.
X['text'] = X['text'].apply(remove_special_characters)

display(X)

Unnamed: 0,text
0,brokenpromises
1,david carradine so sad thais law not sure if ...
2,a 415 b 425 tell your bro i say congrats
3,littlefluffycat indeed
4,completed race 4 life in 58mins with girlies f...
...,...
499995,skynews theyve finally managed to kill f1
499996,elvensapphire im brazilian so brazilian are an...
499997,scherzinicole aww thats so sweet i joined a wh...
499998,woke up kinda depressed and sad


### Tokenization.

In [None]:
# Turn each cleaned tweet into a list of tokens with gensim's simple_preprocess
# (deacc=True is passed through unchanged).
tokenized_rows = X['text'].map(lambda line: simple_preprocess(line, deacc=True))
X['text'] = tokenized_rows

display(X)

Unnamed: 0,text
0,[brokenpromises]
1,"[david, carradine, so, sad, thais, law, not, s..."
2,"[tell, your, bro, say, congrats]"
3,"[littlefluffycat, indeed]"
4,"[completed, race, life, in, mins, with, girlie..."
...,...
499995,"[skynews, theyve, finally, managed, to, kill]"
499996,"[elvensapphire, im, brazilian, so, brazilian, ..."
499997,"[scherzinicole, aww, thats, so, sweet, joined,..."
499998,"[woke, up, kinda, depressed, and, sad]"


### Lemmatization.

In [None]:
# Single shared lemmatizer instance used by lemmatize_text below.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every token as a verb and join the results with single spaces.

    Args:
        text: Iterable of string tokens.

    Returns:
        One string containing the lemmatized tokens separated by spaces.
    """
    lemmas = []
    for token in text:
        # pos="v": every token is lemmatized as if it were a verb.
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return " ".join(lemmas)

# Replace each token list with its lemmatized, space-joined string.
lemmatized_text = X['text'].apply(lemmatize_text)
X['text'] = lemmatized_text

display(X)

Unnamed: 0,text
0,brokenpromises
1,david carradine so sad thais law not sure if i...
2,tell your bro say congrats
3,littlefluffycat indeed
4,complete race life in mins with girlies from w...
...,...
499995,skynews theyve finally manage to kill
499996,elvensapphire im brazilian so brazilian be any...
499997,scherzinicole aww thats so sweet join while ag...
499998,wake up kinda depress and sad


### Split the dataset into train, validation and test sets.


In [None]:
# Hold out 0.5% of all rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.005, random_state=42
)

# Carve a further 0.5% of the remaining rows off as the validation set.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.005, random_state=42
)

# Report the shape of every split, in the same order as before.
for split_name, split_frame in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_val", X_val),
    ("Y_val", Y_val),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(f"{split_name}'s shape:", split_frame.shape)

X_train's shape: (495012, 1)
Y_train's shape: (495012, 1)
X_val's shape: (2488, 1)
Y_val's shape: (2488, 1)
X_test's shape: (2500, 1)
Y_test's shape: (2500, 1)


>We chose to split the dataset roughly 99%-1% (about 99% for training and about 0.5% each for validation and test), because our dataset is very large. 


## Save Train, Val, Test sets.

In [None]:
# Persist every split as '<name>.pkl' inside the prepared-datasets folder on Drive.
prepared_dir = '/content/drive/My Drive/AI(II)/prepared_datasets'

# File stem -> object to pickle; the stems match the previous file names exactly.
splits_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'Y_train': Y_train,
    'Y_val': Y_val,
    'Y_test': Y_test,
}

for split_name, split_frame in splits_to_save.items():
    # `with` guarantees the file is closed even if pickling fails part-way.
    with open(f'{prepared_dir}/{split_name}.pkl', 'wb') as split_file:
        pickle.dump(split_frame, split_file)