# Sentiment Analysis Using ML Algorithms with Data Augmentation
## What is NLP?
> Natural Language Processing is abbreviated as NLP. Whatever we speak or write is understandable to a human, but it is very difficult for a computer to decode the text/speech that we write/speak. To make text understandable to a computer, we convert it into numbers, since a computer can only work with numbers; we can then apply machine learning algorithms to it.  
![NLP](https://venturebeat.com/wp-content/uploads/2018/09/natural-language-processing-e1572968977211.jpg?fit=578%2C289&strip=all)

![source](https://cdn-images-1.medium.com/max/1000/1*Uf_qQ0zF8G8y9zUhndA08w.png)
> The figure above shows the Venn diagram of NLP.

#### Import Library

In [None]:
pip install nlpaug

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import textblob
from textblob import TextBlob, Word
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

#### read the dataset using read_csv() of pandas library

In [None]:
traindata=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv')
testdata=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv')

In [None]:
traindata.head()

In [None]:
testdata.head()

In [None]:
traindata['label'].value_counts()

The data is imbalanced: only about 7% of the samples belong to the positive class, while the rest belong to the negative class.

In [None]:
%matplotlib inline
sns.countplot(traindata['label'])
plt.title('Class-Distribution')

## Data Augmentation

#### What is Data Augmentation?

> Data augmentation is a technique to overcome the imbalance in the target label of a dataset. Most of us have augmented image data by rotating or zooming the image, adding noise, etc.; by doing so we increase the amount of data. For images, Keras provides a class (`ImageDataGenerator`) that helps to produce new images. For text data, I have used the nlpaug library. For a better understanding, you can read this blog post:
>>https://towardsdatascience.com/data-augmentation-library-for-text-9661736b13ff


In [None]:
import nlpaug.augmenter.sentence as nas


You can provide your own API key.

In [None]:
WANDB_API_KEY='sonu'

In [None]:
aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')

In [None]:
text='am not interested in linguistics that does not address race racism is about power raciolinguistics brings'
augmented_texts = aug.augment(text, n=3)
print("Original:")
print(text)
print("Augmented Texts:")
print(augmented_texts)


In [None]:
augmented_texts

In [None]:
# Module-level accumulator: every augmented text produced by data_augument()
# is collected here.
ls = []


def data_augument(df):
    """Create 10 augmented variants of one text and record them in ``ls``.

    Despite its name, ``df`` is a single text: the function is applied
    element-wise to the ``tweet`` column.

    Relies on the module-level augmenter ``aug``. The generated texts are
    appended to the module-level list ``ls`` and also returned.
    """
    generated = aug.augment(df, n=10)
    ls.extend(generated)
    return generated


In [None]:
# `tqdm._tqdm_notebook` is a deprecated private alias; `tqdm.notebook` is the
# public module that exports the same class.
from tqdm.notebook import tqdm_notebook
# Registers `progress_apply` on pandas objects.
tqdm_notebook.pandas()

In [None]:
traindata[traindata['label']==1]['tweet'].progress_apply(data_augument)

In [None]:
array=np.array(ls)

In [None]:
np.save('array',array)


In [None]:
augmented_texts=np.load('../input/array-file/array.npy')

In [None]:
aug_data=pd.DataFrame(augmented_texts,columns=['tweet'])

In [None]:
aug_data['label']=1

In [None]:
aug_data.head()

In [None]:
traindata=pd.concat([traindata.drop(columns=['id']), aug_data], join="outer").sample(frac=1).reset_index(drop=True)

In [None]:
traindata.head()

After augmentation, the dataset is now balanced.

In [None]:

# value_counts() orders rows by frequency, not by label. Sort by label so the
# hard-coded names always line up with class 0 (negative) and class 1
# (positive), whichever class happens to be larger.
class_counts = traindata['label'].value_counts().sort_index()
plt.pie(class_counts, autopct='%1.1f%%', shadow=True,labels=['Negative Class','Positive Class'])
plt.title('Class Distribution');
plt.show()

In [None]:
traindata['preclean_no_words']=  [len(t) for t in traindata.tweet]
sns.boxplot(traindata.preclean_no_words)

In [None]:
#traindata['preclean_no_words']=  [len(t) for t in traindata.tweet]
sns.boxplot(traindata[traindata['label']==0].preclean_no_words)

In [None]:
sns.boxplot(traindata[traindata['label']==1].preclean_no_words)

In [None]:
traindata['no_of_characters']=traindata['tweet'].str.len()
traindata.head()

In [None]:
%matplotlib inline
plt.figure(figsize=(8,8))
sns.distplot(traindata['no_of_characters'])

In [None]:
traindata['no_of_words']=traindata['tweet'].apply(lambda x: len(str(x).split(" ")))
traindata.head()

In [None]:
plt.figure(figsize=(8,8))
sns.distplot(traindata['no_of_words'])

In [None]:
print(traindata['no_of_characters'].max(),"Max'm of all characters")
print(traindata['no_of_words'].max(),"Max'm of all words")


In [None]:
traindata['no_of_hash']=traindata['tweet'].apply(lambda x:len([x for x in x.split() if x.startswith('#')]))
traindata.head()

In [None]:
testdata['no_of_hash']=testdata['tweet'].apply(lambda x:len([x for x in x.split() if x.startswith('#')]))
testdata.head()

In [None]:
plt.figure(figsize=(8,8))
sns.distplot(traindata['no_of_hash'])

In [None]:
traindata['no_of_digits']=traindata['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
traindata.head()

In [None]:
testdata['no_of_digits']=testdata['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
testdata.head()

In [None]:
plt.figure(figsize=(8,8))
# Name the column explicitly: in current seaborn the first positional argument
# of countplot() is `data`, so a bare Series is not plotted as the x variable.
sns.countplot(x='no_of_digits', data=traindata)

#### Text Pre-Processing

In [None]:
train_process=traindata.copy()

#### Lower Case

In [None]:
train_process['tweet_lowercase'] = train_process['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train_process[['tweet', 'tweet_lowercase']].tail()

In [None]:
testdata['tweet_lowercase'] = testdata['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
testdata[['tweet', 'tweet_lowercase']].tail()

#### Remove the stopwords

In [None]:
stop_words = stopwords.words('english')+['0624',
 '07800',
 '07950',
 '08a',
 '100',
 '1000',
 '1000x',
 '100k',
 '101',
 '106',
 '10alltypespos',
 '10am',
 '10days',
 '10k',
 '10th',
 '1117',
 '11400',
 '11th',
 '1200',
 '123',
 '12313',
 '1299',
 '12mill',
 '13479',
 '13th',
 '13thdocumentary',
 '140',
 '14000',
 '14200',
 '142017',
 '148',
 '1499',
 '14th',
 '1500',
 '15000',
 '150516',
 '15thcentury',
 '160',
 '1600',
 '1625',
 '17th',
 '180',
 '18th',
 '1900',
 '190k',
 '1930s',
 '1960',
 '1968',
 '1970',
 '1980',
 '1996',
 '1999',
 '19th',
 '1gabba',
 '1pun',
 '1st',
 '1stammendment',
 '2',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2006',
 '2008',
 '2009',
 '200k',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '201617',
 '2016a',
 '2016election',
 '2016highlights',
 '2016ia',
 '2016in4a',
 '2016in4worda',
 '2016in4words',
 '2016in4wordsa',
 '2016in4worlds',
 '2016ina',
 '2016release',
 '2017',
 '2017fail',
 '2017in3words',
 '2017in3wordsa',
 '2017npr',
 '2018',
 '201a',
 '20days',
 '20th',
 '2100',
 '21st',
 '230pmet',
 '23rd',
 '247',
 '24h',
 '24hrs',
 '24th',
 '25th',
 '26th',
 '280',
 '299',
 '2a',
 '2day',
 '2days',
 '2i',
 '2ia1',
 '2k16',
 '2nd',
 '2nites',
 '2nnð3',
 '2pac',
 '2pm',
 '2raise',
 '2stand',
 '2the',
 '2ð',
 '2ðn',
 '2ðð',
 '2ðð1',
 '2ððð1',
 '2ðððð1',
 '30th',
 '342',
 '350',
 '35th',
 '360',
 '38billion',
 '399',
 '3rd',
 '4',
 '400',
 '400000',
 '40404',
 '41',
 '4a',
 '4a1',
 '4aa1',
 '4ai',
 '4aið',
 '4ejapan',
 '4i',
 '4i1',
 '4maps',
 '4nð3',
 '4o3',
 '4o4o4',
 '4pm',
 '4sa',
 '4th',
 '4wd',
 '4æa',
 '4ð',
 '4ð1',
 '4ð3',
 '4ðað',
 '4ðo3',
 '4ðð',
 '4ðð1',
 '4ðð3',
 '4ððð',
 '4ððð1',
 '4ððð3',
 '4ðððð',
 '4ðððμð1',
 '4μ',
 '4μo3',
 '50',
 '500',
 '50islamicinfo',
 '50th',
 '564943',
 '5hrs',
 '5sos',
 '5th',
 '5wordtrumplethinskin',
 '60',
 '600',
 '60minutes',
 '630',
 '6417153640',
 '642',
 '6pm',
 '6th',
 '6yearolds',
 '70',
 '700',
 '700000',
 '703',
 '799',
 '800',
 '80snostalgia',
 '80yrold',
 '8990',
 '8pm',
 '8th',
 '900',
 '90th',
 '911',
 '940pm',
 '946',
 '952',
 '999',
 '99c',
 '99c99p',
 '99c99pa',
 '99ca',
 '99cents',
 '99p',
 '9am',
 '9pm',
 '9th',
 '__luicalibre__s',
 '_animaladvocate',
 '_øuøu',
 'a1',
 'a15',
 'a17',
 'a1aaaa',
 'a1i',
 'aa1',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'aaahis',
 'aaall',
 'aaaplay',
 'aampe',
 'aand',
 'aande',
 'aanne',
 'aantiislamista',
 'aap',
 'aape',
 'aaron',
 'abandoned',
 'abba',
 'abc',
 'abd',
 'abe',
 'abeed']


In [None]:
stop_words

In [None]:
# `stop_words` is a long list; testing every token against it is a linear scan.
# Build a set once so each membership test is a constant-time lookup.
stop_word_set = set(stop_words)
train_process['tweet_stopwords'] = train_process['tweet_lowercase'].apply(
    lambda text: " ".join(tok for tok in text.split() if tok not in stop_word_set))
train_process[['tweet_stopwords','tweet']].head()


In [None]:
# `stop_words` is a long list; testing every token against it is a linear scan.
# Build a set once so each membership test is a constant-time lookup.
stop_word_set = set(stop_words)
testdata['tweet_stopwords'] = testdata['tweet_lowercase'].apply(
    lambda text: " ".join(tok for tok in text.split() if tok not in stop_word_set))
testdata[['tweet_stopwords','tweet']].head()

#### Remove the punctuations

In [None]:
# The pattern is a regular expression, so regex=True must be requested
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave all punctuation in place. A raw string avoids the
# invalid-escape warning for `\w` / `\s`.
train_process['tweet_punctuation'] = train_process['tweet_stopwords'].str.replace(r'[^\w\s]', '', regex=True)
train_process[['tweet', 'tweet_punctuation']].head()


In [None]:
# The pattern is a regular expression, so regex=True must be requested
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave all punctuation in place. A raw string avoids the
# invalid-escape warning for `\w` / `\s`.
testdata['tweet_punctuation'] = testdata['tweet_stopwords'].str.replace(r'[^\w\s]', '', regex=True)
testdata[['tweet', 'tweet_punctuation']].head()

#### Remove short words (one or two characters)

In [None]:
train_process['tweet_single_letter']=train_process['tweet_punctuation'].apply(lambda words: ' '.join( [w for w in words.split() if len(w)>2] ))
train_process[['tweet_single_letter','tweet']].head()

In [None]:
testdata['tweet_single_letter']=testdata['tweet_punctuation'].apply(lambda words: ' '.join( [w for w in words.split() if len(w)>2] ))
testdata[['tweet_single_letter','tweet']].head()

#### The 25 most used and 25 least used words

In [None]:
print('top 25 used words')
print('-----------------')
# Join the tweets with a space: joining with '' glues the last word of each
# tweet onto the first word of the next one and corrupts the word counts.
print(pd.Series(' '.join(train_process['tweet_single_letter']).split()).value_counts()[0:25])


In [None]:
print('least 25 used words')
print('-------------------')
# Join the tweets with a space: joining with '' glues the last word of each
# tweet onto the first word of the next one and corrupts the word counts.
print(pd.Series(' '.join(train_process['tweet_single_letter']).split()).value_counts()[-25:])

In [None]:
##train_process['tweet_correct']=train_process['tweet_single_letter'].progress_apply(lambda x: str(TextBlob(x).correct()))


#### Lemmatize the sentences

In [None]:
# Create the lemmatizer once; the original constructed a new instance for
# every single word.
lemmatizer = WordNetLemmatizer()
train_process['tweet_lemma']=train_process['tweet_single_letter'].progress_apply(
    lambda words: ' '.join(lemmatizer.lemmatize(w) for w in words.split()))
train_process[['tweet_lemma','tweet']].head()

In [None]:
# Create the lemmatizer once; the original constructed a new instance for
# every single word.
lemmatizer = WordNetLemmatizer()
testdata['tweet_lemma']=testdata['tweet_single_letter'].progress_apply(
    lambda words: ' '.join(lemmatizer.lemmatize(w) for w in words.split()))
testdata[['tweet_lemma','tweet']].head()

#### Train-Test Split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(train_process['tweet_lemma'],train_process['label'],test_size=0.33,random_state=42)

#### TF-IDF
>> TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents.

In [None]:
# Word-level TF-IDF over unigrams that occur in at least 3 documents.
# The on/off flags are passed as real booleans: recent scikit-learn releases
# validate constructor parameters and reject ints for boolean options.
feature = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,1), use_idf=True,smooth_idf=True,sublinear_tf=True,
            stop_words = 'english')


In [None]:
# Learn the vocabulary/IDF weights on the training split and vectorize it in
# a single call; `feature` is left fitted for the later transform() calls.
x_train = feature.fit_transform(X_train)

In [None]:
testdata=feature.transform(testdata['tweet_lemma'])

In [None]:
x_test=feature.transform(X_test)

In [None]:
x_train.toarray()

In [None]:
#feature.get_feature_names()

#### The evaluation metric is the F1 score

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import time

In [None]:
# Search space for the logistic-regression tuning below.
# The solver is pinned to 'liblinear' because the default solver ('lbfgs')
# does not support the 'l1' penalty, so every sampled 'l1' candidate would
# fail to fit; 'liblinear' supports both 'l1' and 'l2'.
param_grid = {'C': np.arange(20,30,2),
              'max_iter': np.arange(100,1200,100),
              'penalty': ['l1','l2'],
              'solver': ['liblinear']}

In [None]:
kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True)

# Validation labels and predictions of every fold are collected here so that
# the metrics computed after the loop cover all folds, not only the last one.
oof_true, oof_pred = [], []

for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train), start=1):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr,xvl = x_train[train_index],x_train[test_index]
    ytr,yvl = y_train.iloc[train_index],y_train.iloc[test_index]
    # Hyper-parameters are re-tuned on the training part of each fold.
    model = RandomizedSearchCV(estimator=LogisticRegression(class_weight='balanced'),param_distributions=param_grid,verbose=1)
    model.fit(xtr, ytr)
    fold_pred = model.predict(xvl)
    print('roc_auc_score',roc_auc_score(yvl,fold_pred))
    oof_true.append(yvl)
    oof_pred.append(fold_pred)

# `yvl` and `pred` are what the metrics cell after this loop reads. Point them
# at the pooled out-of-fold results; previously they held only the final fold,
# so the reported "mean" scores described a single fold.
yvl = pd.concat(oof_true)
pred = np.concatenate(oof_pred)

In [None]:
print ('best parameters',model.best_params_)

In [None]:
roc_auc_logistic = roc_auc_score(yvl,pred).mean()
f1_logistic = f1_score(yvl,pred).mean()
print('Mean - ROC AUC', roc_auc_logistic)
print('F1 Score - ', f1_logistic)
print('Confusion Matrix \n',confusion_matrix(yvl,pred))


In [None]:
import warnings
warnings.filterwarnings('ignore')

## Decision Tree Classifier

In [None]:
#DecisionTree with tuned hyperparameters
from sklearn.tree import DecisionTreeClassifier
start_time = time.time()
param_grid = {'criterion': ['gini','entropy'],
             'min_samples_split':[50,70,100,150],
             'max_features': ['sqrt','log2']}

kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True)

# Validation labels and predictions of every fold are collected here so that
# the metrics computed after the loop cover all folds, not only the last one.
oof_true, oof_pred = [], []

for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train), start=1):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr,xvl = x_train[train_index],x_train[test_index]
    ytr,yvl = y_train.iloc[train_index],y_train.iloc[test_index]
    # Hyper-parameters are re-tuned on the training part of each fold;
    # class 1 is weighted 5x relative to class 0.
    model = RandomizedSearchCV(estimator=DecisionTreeClassifier(class_weight={0:1,1:5}),param_distributions=param_grid,verbose=1)
    model.fit(xtr, ytr)
    fold_pred = model.predict(xvl)
    print('roc_auc_score',roc_auc_score(yvl,fold_pred))
    oof_true.append(yvl)
    oof_pred.append(fold_pred)

# `yvl` and `pred` are what the metrics cell after this one reads. Point them
# at the pooled out-of-fold results; previously they held only the final fold,
# so the reported "mean" scores described a single fold.
yvl = pd.concat(oof_true)
pred = np.concatenate(oof_pred)

# time.time() differences are in seconds, not milliseconds.
print("Execution time: " + str((time.time() - start_time)) + ' s')
# Parameters chosen by the search run on the last fold.
print ('best parameters',model.best_params_)


In [None]:
#Model Accuracy
roc_auc_dt = roc_auc_score(yvl,pred).mean()
f1_dt = f1_score(yvl,pred).mean()
print('Mean - ROC AUC', roc_auc_dt)
print('F1 Score - ', f1_dt)
print('Confusion Matrix \n',confusion_matrix(yvl,pred))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()
param_grid = {'criterion': ['entropy'],
             'min_samples_split':np.arange(10,100,20),
             'max_features': ['sqrt'],
             'n_estimators':[10,20,30]}

kf = StratifiedKFold(n_splits=10,random_state=1,shuffle=True)

# Validation labels and predictions of every fold are collected here so that
# the metrics computed after the loop cover all folds, not only the last one.
oof_true, oof_pred = [], []

for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train), start=1):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr,xvl = x_train[train_index],x_train[test_index]
    ytr,yvl = y_train.iloc[train_index],y_train.iloc[test_index]
    # Hyper-parameters are re-tuned on the training part of each fold.
    model = RandomizedSearchCV(estimator=RandomForestClassifier(),param_distributions=param_grid,verbose=1)
    model.fit(xtr, ytr)
    fold_pred = model.predict(xvl)
    print('roc_auc_score',roc_auc_score(yvl,fold_pred))
    oof_true.append(yvl)
    oof_pred.append(fold_pred)

# `yvl` and `pred` are what the metrics cell after this one reads. Point them
# at the pooled out-of-fold results; previously they held only the final fold,
# so the reported "mean" scores described a single fold.
yvl = pd.concat(oof_true)
pred = np.concatenate(oof_pred)

# time.time() differences are in seconds, not milliseconds.
print("Execution time: " + str((time.time() - start_time)) + ' s')
# Parameters chosen by the search run on the last fold.
print ('best parameters',model.best_params_)

In [None]:
#Model Accuracy
roc_auc_rf = roc_auc_score(yvl,pred).mean()
f1_rf = f1_score(yvl,pred).mean()
print('Mean - ROC AUC', roc_auc_rf)
print('F1 Score - ', f1_rf)
print('Confusion Matrix \n',confusion_matrix(yvl,pred))

In [None]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest'],
    'Mean - ROC AUC Score (Fold=10)': [roc_auc_logistic, roc_auc_dt, roc_auc_rf],
    'Mean - F1 Score': [f1_logistic,f1_dt,f1_rf]})

In [None]:
results.head()

In [None]:
logistic=LogisticRegression(penalty='l2',max_iter=100,C=28)

In [None]:
testdata

In [None]:
logistic.fit(x_train,y_train)
# predict() accepts the sparse TF-IDF matrix directly; converting it with
# .toarray() only allocates a huge dense copy of the test features.
pred=logistic.predict(testdata)

In [None]:
# `testdata` was overwritten with the sparse TF-IDF matrix earlier in the
# notebook, so it no longer has an 'id' column. Re-read the ids from the
# original test file; the row order is the same one the predictions follow.
sub=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv', usecols=['id'])

In [None]:
sub['label']=pred

In [None]:
# index=False keeps the DataFrame's positional index out of the file, so the
# submission contains only the columns present in `sub`.
sub.to_csv('sub.csv', index=False)

Hope you liked it!!
Do upvote it!!