### Task 1:Explain the pipeline for developing sentiment analysis task. 

### Task 2:Perform cleaning and preprocessing of text. 

In [2]:
import pandas as pd
df = pd.read_csv(r'labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
df.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [4]:
df = df[['tweet','class']]
df.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


# TEXT PREPROCESSING

### Convert text to lower case

In [5]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,class,clean_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,!!! rt @mayasolovely: as a woman you shouldn't...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,!!!!! rt @mleew17: boy dats cold...tyga dwn ba...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,!!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,!!!!!!!!! rt @c_g_anderson: @viva_based she lo...
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,!!!!!!!!!!!!! rt @shenikaroberts: the shit you...


### Removing all characters that are not letters or spaces (Removing punctuations as well)

In [6]:
import re

# Variable to replace all characters that are not letters or whitespace
regex = re.compile('[^a-z\s]')
# Removes all characters that are not letters or spaces
df['clean_text'] = df['clean_text'].apply(lambda x: regex.sub('', x))
df.head()

Unnamed: 0,tweet,class,clean_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,rt mayasolovely as a woman you shouldnt compl...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,rt mleew boy dats coldtyga dwn bad for cuffin...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,rt urkindofbrand dawg rt sbabylife you ever f...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,rt cganderson vivabased she look like a tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,rt shenikaroberts the shit you hear about me ...


### Remove words with less than 3 characters

In [7]:
df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
df.head()

Unnamed: 0,tweet,class,clean_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,mayasolovely woman you shouldnt complain about...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,mleew boy dats coldtyga dwn bad for cuffin dat...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,urkindofbrand dawg sbabylife you ever fuck bit...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,cganderson vivabased she look like tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,shenikaroberts the shit you hear about might t...


### Tokenization

In [9]:
import nltk
#nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ai_ds_b2/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk.tokenize import word_tokenize
df['text_without_stopwords'] = df['clean_text'].apply(word_tokenize)

### Removing Stop Words

In [11]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ai_ds_b2/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['text_without_stopwords'] = df['text_without_stopwords'].apply(lambda x: [word for word in x if word not in stop_words])
df.head()

Unnamed: 0,tweet,class,clean_text,text_without_stopwords
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,mayasolovely woman you shouldnt complain about...,"[mayasolovely, woman, shouldnt, complain, clea..."
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,mleew boy dats coldtyga dwn bad for cuffin dat...,"[mleew, boy, dats, coldtyga, dwn, bad, cuffin,..."
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,urkindofbrand dawg sbabylife you ever fuck bit...,"[urkindofbrand, dawg, sbabylife, ever, fuck, b..."
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,cganderson vivabased she look like tranny,"[cganderson, vivabased, look, like, tranny]"
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,shenikaroberts the shit you hear about might t...,"[shenikaroberts, shit, hear, might, true, migh..."


### Word Normalization

In [14]:
#nltk.download('wordnet')

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['normalized_text'] = df['text_without_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df.head()

Unnamed: 0,tweet,class,clean_text,text_without_stopwords,normalized_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,mayasolovely woman you shouldnt complain about...,"[mayasolovely, woman, shouldnt, complain, clea...","[mayasolovely, woman, shouldnt, complain, clea..."
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,mleew boy dats coldtyga dwn bad for cuffin dat...,"[mleew, boy, dats, coldtyga, dwn, bad, cuffin,...","[mleew, boy, dat, coldtyga, dwn, bad, cuffin, ..."
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,urkindofbrand dawg sbabylife you ever fuck bit...,"[urkindofbrand, dawg, sbabylife, ever, fuck, b...","[urkindofbrand, dawg, sbabylife, ever, fuck, b..."
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,cganderson vivabased she look like tranny,"[cganderson, vivabased, look, like, tranny]","[cganderson, vivabased, look, like, tranny]"
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,shenikaroberts the shit you hear about might t...,"[shenikaroberts, shit, hear, might, true, migh...","[shenikaroberts, shit, hear, might, true, migh..."


In [2]:
#pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[K     |████████████████████████████████| 26.6 MB 414 kB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 594 kB/s eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-6.4.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=df['normalized_text'], vector_size=100, window=5, min_count=1, workers=4)

In [20]:
word2vec.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f4e759df430>

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(word2vec.wv, df['class'], test_size = 0.3)

ValueError: Found input variables with inconsistent numbers of samples: [34128, 24783]

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
model_word2vec= MultinomialNB().fit(x_train_vectorizer, y_train)
prediction_vectorizer=model_vectorizer.predict(x_test_vectorizer)
print(confusion_matrix(y_test,prediction_vectorizer))
print (classification_report(y_test, prediction_vectorizer))