# Acknowledgement:
### Notebook inspired from following sources:
1. https://towardsdatascience.com/sentiment-analysis-with-text-mining-13dd2b33de27
2. https://www.youtube.com/watch?v=ujId4ipkBio
3. https://stackoverflow.com/questions/43018030/


In [529]:
import pandas as pd
import numpy as np  

# Importing Data


In [530]:
train = pd.read_csv("train.csv", encoding='ISO-8859-1')
test = pd.read_csv("test.csv", encoding='ISO-8859-1')

In [531]:
# Understanding the data
print(train.head())
print(test.head())

   Id                                               text  Target
0   1  @USAirways  ! THE WORST in customer service. @...      -1
1   2  @united call wait times are over 20 minutes an...      -1
2   3  @JetBlue what's up with the random delay on fl...      -1
3   4  @AmericanAir Good morning!  Wondering why my p...       0
4   5  @united UA 746. Pacific Rim and Date Night cut...      -1
     id                                               text
0  7322  @AmericanAir In car gng to DFW. Pulled over 1h...
1  7323  @AmericanAir after all, the plane didnÂÃÂªt ...
2  7324  @SouthwestAir can't believe how many paying cu...
3  7325  @USAirways I can legitimately say that I would...
4  7326  @AmericanAir still no response from AA. great ...


In [532]:
# Shape of train and test data
print("Train data:", train.shape)
print("Test data:", test.shape)

Train data: (7320, 3)
Test data: (7320, 2)


# Data Cleaning and Preprocessing

In [534]:
# Replacing short words 
shortwords_dict = {
    "doesn't" : "does not",
    "didn't" : "did not",
    "don't" : "do not",
    "can't" : "can not",
    "couldn't" : "could not",
    "could've" : "could have",
    "aren't" : "are not",
    "ain't" : "is not",
    "isn't" : "is not",
    "it's" : "it is",
    "i'll" : "i will",
    "i'd" : "i would",
    "i've" : "i have",
    "i'm" : "i am",
    "he's" : "he is",
    "he'll" : "he will",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "how's" : "how is",
    "she'll" : "she will",
    "she's" : "she is",
    "should've" : "should have",
    "shouldn't" : "should not",
    "let's" : "let us",
    "that's" : "that is",
    "they're" : "they are",
    "they've" : "they have",
    "they'll" : "they will",
    "that'd" : "that would",
    "'cause" : "because",
    "ma'am" : "madam",
    "might've" : "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "we'd" : "we would",
    "we're" : "we are",
    "we've" : "we have",
    "we'll" : "we will",
    "weren't" : "were not",
    "what'll" : "what will",
    "what's" : "what is",
    "won't" : "will not",
    "wouldn't" : "would not",
    "would've" : "would have",
    "y'all" : "you all",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have"
}

def replacesw(text):
   tokens = text.split()
   replaced = []
   for token in tokens:
     replaced.append(shortwords_dict[token] if token in shortwords_dict else token)
   text = " ".join(word for word in replaced)
   return text

In [535]:
# Finding and replacing emoticons
from nltk.tokenize import TweetTokenizer
import regex as re 

emoticon = {
    ':D': 'laughface',
    ':-D': 'laughface',
    ':)': 'smileface',
    ':-)': 'smileface',
    ':(': 'sadface',
    ':-(': 'sadface',
    ':/': 'troubledface',
    ':-/': 'troubledface',
    ':|': 'straightface',
    ':-|': 'straightface',

}

def emoticonToWord(text):
  tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
  tokens = tokenizer.tokenize(text)
  replaced = []
  for token in tokens:
    replaced.append(emoticon[token] if token in emoticon else token)
  text = " ".join(word for word in replaced)
  return text

In [536]:
# Data Cleaning
def clean(text):
  # Replacing emoticons with text
  text = emoticonToWord(text)
  # Replacing short words like can't, i'd, etc
  text = replacesw(text)
  # Removing @ mentions
  text = re.sub(r'@\w+','',text)
  # Removing words starting with & i.e. &amp, &lt etc
  text = re.sub(r'&\w+','',text)
  # Replacing period with a space
  text = re.sub(r'\.',' ',text)
  # Replacing / with or
  text = re.sub(r'/',' or ',text)
  # Removing punctuations and special characters and # symbol at the beginning of word
  text = re.sub(r'[^A-Za-z0-9\s]','',text)
  # Removing hyperlinks
  text = re.sub(r'https?\w+','',text)
  # Removing double spaces
  text = re.sub(r'\s+',' ',text)
  # Removing digits
  text = re.sub(r'\d+','',text)
  # Converting text to lower case
  text = text.lower()

  return text

## Stemming 

In [538]:
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer()


In [539]:
train['text'] = train['text'].apply(lambda text: " ".join(stemmer.stem(token) for token in text.split()))
test['text'] = test['text'].apply(lambda text: " ".join(stemmer.stem(token) for token in text.split()))

In [540]:
# Printing train and test data
print("Train data")
print(train.head())
print("Test data")
print(test.head())

Train data
   Id                                               text  Target
0   1  the worst in custom servic call for over a mon...      -1
1   2  call wait time are over minut and airport wait...      -1
2   3  what is up with the random delay on flight ani...      -1
3   4  good morn wonder whi my pretsa check wa not on...       0
4   5  ua pacif rim and date night cut out not consta...      -1
Test data
     id                                               text
0  7322  in car gng to dfw pull over hr ago veri ici ro...
1  7323  after all the plane didn t land in ident or wo...
2  7324  can not believ how mani pay custom you left hi...
3  7325  i can legitim say that i would have rather dri...
4  7326             still no respons from aa great job guy


In [541]:
#from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger')
#def get_part_of_speech_tags(token):
#    tag_dict = {"J": wordnet.ADJ,
#                "N": wordnet.NOUN,
#                "V": wordnet.VERB,
#                "R": wordnet.ADV}
#    
#    tag = nltk.pos_tag([token])[0][1][0].upper()
#    
#    return tag_dict.get(tag, wordnet.NOUN)

In [542]:
#nltk.download('wordnet')
#from nltk.stem import WordNetLemmatizer 
#lemmatizer = WordNetLemmatizer()

In [482]:
#train['text'] = train['text'].apply(lambda sentence:" ".join(lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in sentence.lower().split()))
#test['text'] = test['text'].apply(lambda sentence:" ".join(lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in sentence.lower().split()))

# Vectorization and Machine Learning Models (Without Removing Stop Words)

### Count Vectorization

In [543]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import  classification_report

In [544]:
# Splitting the train data into train and test
X = train['text']
y = train['Target']

In [545]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

#### Naive Bayes


In [410]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', CountVectorizer(strip_accents='ascii', max_df=0.5)),
    ('c', MultinomialNB(fit_prior=True, class_prior=None))
    ])

# Setting parameters
params = {
   # 'v__max_df': [0.3,0.5,0.8],
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'c__alpha': [0.25,0.5,0.75]
}

gridSearchNB = GridSearchCV(pipeline,params,cv=10,n_jobs=-1, scoring="f1_macro")

In [411]:
gridSearchNB.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [412]:
print("Best Parameters: ", gridSearchNB.best_params_)
print("Train Score: {:.2f}".format(gridSearchNB.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchNB.score(X_test,y_test)))

Best Parameters:  {'c__alpha': 0.25, 'v__ngram_range': (1, 2)}
Train Score: 0.97
Test Score: 0.69


In [413]:
print(classification_report(y_train,gridSearchNB.predict(X_train)))
print(classification_report(y_test,gridSearchNB.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.98      0.98      0.98      3634
           0       0.95      0.94      0.95      1241
           1       0.96      0.98      0.97       981

    accuracy                           0.97      5856
   macro avg       0.96      0.97      0.97      5856
weighted avg       0.97      0.97      0.97      5856

              precision    recall  f1-score   support

          -1       0.81      0.95      0.87       932
           0       0.66      0.42      0.51       295
           1       0.82      0.60      0.69       237

    accuracy                           0.79      1464
   macro avg       0.76      0.66      0.69      1464
weighted avg       0.78      0.79      0.77      1464



#### Logistic Regression

In [546]:
from sklearn.linear_model import LogisticRegression

In [547]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', CountVectorizer(strip_accents='ascii', max_df =0.5)),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=10000))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'lr__penalty': ['l1','l2'],
    'lr__C': [0.1, 0.25, 0.5, 1.0]
}

gridSearchLR = GridSearchCV(pipeline,params,cv=5,n_jobs=-1, scoring="f1_macro")

In [549]:
gridSearchLR.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        preproc

In [550]:
print("Best Parameters: ", gridSearchLR.best_params_)
print("Train Score: {:.2f}".format(gridSearchLR.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchLR.score(X_test,y_test)))

Best Parameters:  {'lr__C': 1.0, 'lr__penalty': 'l2', 'v__ngram_range': (1, 2)}
Train Score: 0.99
Test Score: 0.75


In [551]:
print(classification_report(y_train,gridSearchLR.predict(X_train)))
print(classification_report(y_test,gridSearchLR.predict(X_test)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3634
           0       0.99      0.99      0.99      1241
           1       0.99      0.99      0.99       981

    accuracy                           1.00      5856
   macro avg       0.99      0.99      0.99      5856
weighted avg       1.00      1.00      1.00      5856

              precision    recall  f1-score   support

          -1       0.85      0.92      0.89       932
           0       0.69      0.62      0.65       295
           1       0.81      0.65      0.72       237

    accuracy                           0.82      1464
   macro avg       0.79      0.73      0.75      1464
weighted avg       0.81      0.82      0.81      1464



#### XGBoost

In [486]:
from xgboost.sklearn import XGBClassifier

In [487]:
cv = CountVectorizer(strip_accents='ascii', max_df =0.5)
X_trainxgb = cv.fit_transform(X_train)
X_testxgb = cv.transform(X_test)

xgb = XGBClassifier(objective='multi:softmax', seed=0, missing=None, max_depth=9,early_stopping_rounds = 5)
xgb.fit(X_trainxgb,y_train)

print("Train score: {:.2f}".format(xgb.score(X_trainxgb,y_train)))
print("Test score: {:.2f}".format(xgb.score(X_testxgb,y_test)))

Train score: 0.89
Test score: 0.79


In [488]:
print(classification_report(y_train,xgb.predict(X_trainxgb)))
print(classification_report(y_test,xgb.predict(X_testxgb)))

              precision    recall  f1-score   support

          -1       0.91      0.96      0.94      3634
           0       0.82      0.80      0.81      1241
           1       0.92      0.78      0.84       981

    accuracy                           0.89      5856
   macro avg       0.88      0.85      0.86      5856
weighted avg       0.89      0.89      0.89      5856

              precision    recall  f1-score   support

          -1       0.82      0.92      0.87       932
           0       0.70      0.54      0.61       295
           1       0.75      0.60      0.67       237

    accuracy                           0.79      1464
   macro avg       0.76      0.69      0.72      1464
weighted avg       0.79      0.79      0.78      1464



## TFIDF

In [420]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', TfidfVectorizer(norm=None, max_df=0.5)),
    ('c', MultinomialNB(fit_prior=True, class_prior=None))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'c__alpha': [0.25,0.5,0.75]
}

gridSearchNBtfidf = GridSearchCV(pipeline,params,cv=10,n_jobs=-1, scoring="f1_macro")

In [421]:
gridSearchNBtfidf.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        norm

In [422]:
print("Best Parameters: ", gridSearchNBtfidf.best_params_)
print("Train Score: {:.2f}".format(gridSearchNBtfidf.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchNBtfidf.score(X_test,y_test)))

Best Parameters:  {'c__alpha': 0.75, 'v__ngram_range': (1, 2)}
Train Score: 0.98
Test Score: 0.69


In [423]:
print(classification_report(y_train,gridSearchNBtfidf.predict(X_train)))
print(classification_report(y_test,gridSearchNBtfidf.predict(X_test)))

              precision    recall  f1-score   support

          -1       1.00      0.99      0.99      3634
           0       0.97      0.97      0.97      1241
           1       0.97      0.99      0.98       981

    accuracy                           0.99      5856
   macro avg       0.98      0.99      0.98      5856
weighted avg       0.99      0.99      0.99      5856

              precision    recall  f1-score   support

          -1       0.83      0.90      0.87       932
           0       0.63      0.48      0.55       295
           1       0.68      0.65      0.67       237

    accuracy                           0.78      1464
   macro avg       0.71      0.68      0.69      1464
weighted avg       0.77      0.78      0.77      1464



#### Logistic Regression

In [424]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', TfidfVectorizer(norm=None, max_df=0.5)),
    ('lr', LogisticRegression(solver='lbfgs'))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'lr__penalty': ['l1','l2'],
    'lr__C': [0.1, 0.25, 0.5, 1.0]
}

gridSearchLRtfidf = GridSearchCV(pipeline,params,cv=5,n_jobs=-1, scoring="f1_macro")

In [425]:
gridSearchLRtfidf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        norm=

In [426]:
print("Best Parameters: ", gridSearchLRtfidf.best_params_)
print("Train Score: {:.2f}".format(gridSearchLRtfidf.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchLRtfidf.score(X_test,y_test)))

Best Parameters:  {'lr__C': 0.1, 'lr__penalty': 'l2', 'v__ngram_range': (1, 2)}
Train Score: 1.00
Test Score: 0.75


In [427]:
print(classification_report(y_train,gridSearchLRtfidf.predict(X_train)))
print(classification_report(y_test,gridSearchLRtfidf.predict(X_test)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3634
           0       1.00      0.99      1.00      1241
           1       0.99      1.00      0.99       981

    accuracy                           1.00      5856
   macro avg       1.00      1.00      1.00      5856
weighted avg       1.00      1.00      1.00      5856

              precision    recall  f1-score   support

          -1       0.85      0.92      0.88       932
           0       0.68      0.59      0.63       295
           1       0.81      0.66      0.73       237

    accuracy                           0.81      1464
   macro avg       0.78      0.73      0.75      1464
weighted avg       0.81      0.81      0.81      1464



#### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

In [489]:
tfidf = TfidfVectorizer(norm=None, max_df =0.5)
X_trainxgb = tfidf.fit_transform(X_train)
X_testxgb = tfidf.transform(X_test)

xgbtfidf = XGBClassifier(objective='multi:softmax', seed=0, missing=None, max_depth=9,early_stopping_rounds = 5)
xgbtfidf.fit(X_trainxgb,y_train)

print("Train score: {:.2f}".format(xgbtfidf.score(X_trainxgb,y_train)))
print("Test score: {:.2f}".format(xgbtfidf.score(X_testxgb,y_test)))

Train score: 0.89
Test score: 0.79


In [490]:
print(classification_report(y_train,xgbtfidf.predict(X_trainxgb)))
print(classification_report(y_test,xgbtfidf.predict(X_testxgb)))

              precision    recall  f1-score   support

          -1       0.91      0.96      0.94      3634
           0       0.82      0.80      0.81      1241
           1       0.92      0.78      0.84       981

    accuracy                           0.89      5856
   macro avg       0.88      0.85      0.86      5856
weighted avg       0.89      0.89      0.89      5856

              precision    recall  f1-score   support

          -1       0.82      0.92      0.87       932
           0       0.70      0.54      0.61       295
           1       0.75      0.60      0.67       237

    accuracy                           0.79      1464
   macro avg       0.76      0.69      0.72      1464
weighted avg       0.79      0.79      0.78      1464



# Vectorization and Machine Learning Models (With removing Stop Words)

In [491]:
# Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [492]:
stop_nltk = set(nltk_stopwords.words('english'))

In [493]:
train["text"] = train["text"].apply(lambda text: " ".join(word for word in text.split() if word not in stop_nltk))
test["text"] = test["text"].apply(lambda text: " ".join(word for word in text.split() if word not in stop_nltk))

### Count Vectorization

In [431]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import  classification_report

In [495]:
# Splitting the train data into train and test
X = train['text']
y = train['Target']

In [506]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

#### Naive Bayes


In [435]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', CountVectorizer(strip_accents='ascii', max_df=0.5)),
    ('c', MultinomialNB(fit_prior=True, class_prior=None))
    ])

# Setting parameters
params = {
   # 'v__max_df': [0.3,0.5,0.8],
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'c__alpha': [0.25,0.5,0.75]
}

gridSearchNB1 = GridSearchCV(pipeline,params,cv=10,n_jobs=-1, scoring="f1_macro")

In [437]:
gridSearchNB1.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [438]:
print("Best Parameters: ", gridSearchNB1.best_params_)
print("Train Score: {:.2f}".format(gridSearchNB1.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchNB1.score(X_test,y_test)))

Best Parameters:  {'c__alpha': 0.25, 'v__ngram_range': (1, 1)}
Train Score: 0.87
Test Score: 0.67


In [439]:
print(classification_report(y_train,gridSearchNB1.predict(X_train)))
print(classification_report(y_test,gridSearchNB1.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.90      0.96      0.93      3634
           0       0.87      0.72      0.79      1241
           1       0.89      0.87      0.88       981

    accuracy                           0.89      5856
   macro avg       0.89      0.85      0.87      5856
weighted avg       0.89      0.89      0.89      5856

              precision    recall  f1-score   support

          -1       0.81      0.91      0.86       932
           0       0.60      0.41      0.49       295
           1       0.69      0.64      0.66       237

    accuracy                           0.76      1464
   macro avg       0.70      0.65      0.67      1464
weighted avg       0.75      0.76      0.75      1464



#### Logistic Regression

In [440]:
from sklearn.linear_model import LogisticRegression

In [441]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', CountVectorizer(strip_accents='ascii', max_df =0.5)),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=10000))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'lr__penalty': ['l1','l2'],
    'lr__C': [0.1, 0.25, 0.5, 1.0]
}

gridSearchLR1 = GridSearchCV(pipeline,params,cv=5,n_jobs=-1, scoring="f1_macro")

In [442]:
gridSearchLR1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        preproc

In [443]:
print("Best Parameters: ", gridSearchLR1.best_params_)
print("Train Score: {:.2f}".format(gridSearchLR1.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchLR1.score(X_test,y_test)))

Best Parameters:  {'lr__C': 1.0, 'lr__penalty': 'l2', 'v__ngram_range': (1, 2)}
Train Score: 0.99
Test Score: 0.73


In [444]:
print(classification_report(y_train,gridSearchLR1.predict(X_train)))
print(classification_report(y_test,gridSearchLR1.predict(X_test)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3634
           0       0.98      0.98      0.98      1241
           1       0.98      0.99      0.99       981

    accuracy                           0.99      5856
   macro avg       0.99      0.99      0.99      5856
weighted avg       0.99      0.99      0.99      5856

              precision    recall  f1-score   support

          -1       0.85      0.90      0.87       932
           0       0.63      0.57      0.60       295
           1       0.76      0.67      0.71       237

    accuracy                           0.79      1464
   macro avg       0.75      0.71      0.73      1464
weighted avg       0.79      0.79      0.79      1464



#### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

In [497]:
cv = CountVectorizer(strip_accents='ascii', max_df =0.5)
X_trainxgb = cv.fit_transform(X_train)
X_testxgb = cv.transform(X_test)

xgb1 = XGBClassifier(objective='multi:softmax', seed=0, missing=None, max_depth=9,early_stopping_rounds = 5)
xgb1.fit(X_trainxgb,y_train)

print("Train score: {:.2f}".format(xgb1.score(X_trainxgb,y_train)))
print("Test score: {:.2f}".format(xgb1.score(X_testxgb,y_test)))

Train score: 0.85
Test score: 0.78


In [498]:
print(classification_report(y_train,xgb1.predict(X_trainxgb)))
print(classification_report(y_test,xgb1.predict(X_testxgb)))

              precision    recall  f1-score   support

          -1       0.88      0.94      0.91      3634
           0       0.75      0.71      0.73      1241
           1       0.86      0.73      0.79       981

    accuracy                           0.85      5856
   macro avg       0.83      0.79      0.81      5856
weighted avg       0.85      0.85      0.85      5856

              precision    recall  f1-score   support

          -1       0.82      0.90      0.86       932
           0       0.63      0.50      0.56       295
           1       0.73      0.62      0.67       237

    accuracy                           0.78      1464
   macro avg       0.73      0.68      0.70      1464
weighted avg       0.77      0.78      0.77      1464



## TFIDF

In [445]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', TfidfVectorizer(norm=None, max_df=0.5)),
    ('c', MultinomialNB(fit_prior=True, class_prior=None))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'c__alpha': [0.25,0.5,0.75]
}

gridSearchNBtfidf1 = GridSearchCV(pipeline,params,cv=10,n_jobs=-1, scoring="f1_macro")

In [446]:
gridSearchNBtfidf1.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        norm

In [447]:
print("Best Parameters: ", gridSearchNBtfidf1.best_params_)
print("Train Score: {:.2f}".format(gridSearchNBtfidf1.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchNBtfidf1.score(X_test,y_test)))

Best Parameters:  {'c__alpha': 0.75, 'v__ngram_range': (1, 2)}
Train Score: 0.98
Test Score: 0.67


In [448]:
print(classification_report(y_train,gridSearchNBtfidf1.predict(X_train)))
print(classification_report(y_test,gridSearchNBtfidf1.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.99      0.99      0.99      3634
           0       0.98      0.96      0.97      1241
           1       0.97      0.99      0.98       981

    accuracy                           0.99      5856
   macro avg       0.98      0.98      0.98      5856
weighted avg       0.99      0.99      0.99      5856

              precision    recall  f1-score   support

          -1       0.83      0.88      0.85       932
           0       0.56      0.43      0.49       295
           1       0.65      0.68      0.67       237

    accuracy                           0.76      1464
   macro avg       0.68      0.66      0.67      1464
weighted avg       0.75      0.76      0.75      1464



In [449]:
# Creating a pipeline
pipeline = Pipeline([
    ('v', TfidfVectorizer(norm=None, max_df=0.5)),
    ('lr', LogisticRegression(solver='lbfgs'))
    ])

# Setting parameters
params = {
    'v__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
    'lr__penalty': ['l1','l2'],
    'lr__C': [0.1, 0.25, 0.5, 1.0]
}

gridSearchLRtfidf1 = GridSearchCV(pipeline,params,cv=5,n_jobs=-1, scoring="f1_macro")

In [450]:
gridSearchLRtfidf1.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('v',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        norm=

In [451]:
print("Best Parameters: ", gridSearchLRtfidf1.best_params_)
print("Train Score: {:.2f}".format(gridSearchLRtfidf1.score(X_train,y_train)))
print("Test Score: {:.2f}".format(gridSearchLRtfidf1.score(X_test,y_test)))

Best Parameters:  {'lr__C': 0.1, 'lr__penalty': 'l2', 'v__ngram_range': (1, 2)}
Train Score: 0.99
Test Score: 0.73


In [452]:
print(classification_report(y_train,gridSearchLRtfidf1.predict(X_train)))
print(classification_report(y_test,gridSearchLRtfidf1.predict(X_test)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3634
           0       0.99      0.99      0.99      1241
           1       0.99      0.99      0.99       981

    accuracy                           1.00      5856
   macro avg       0.99      0.99      0.99      5856
weighted avg       1.00      1.00      1.00      5856

              precision    recall  f1-score   support

          -1       0.85      0.90      0.87       932
           0       0.63      0.57      0.60       295
           1       0.78      0.68      0.73       237

    accuracy                           0.80      1464
   macro avg       0.75      0.72      0.73      1464
weighted avg       0.79      0.80      0.79      1464



#### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

In [499]:
tfidf = TfidfVectorizer(norm=None, max_df =0.5)
X_trainxgb = tfidf.fit_transform(X_train)
X_testxgb = tfidf.transform(X_test)

xgbtfidf1 = XGBClassifier(objective='multi:softmax', seed=0, missing=None, max_depth=9,early_stopping_rounds = 5)
xgbtfidf1.fit(X_trainxgb,y_train)

print("Train score: {:.2f}".format(xgbtfidf1.score(X_trainxgb,y_train)))
print("Test score: {:.2f}".format(xgbtfidf1.score(X_testxgb,y_test)))

Train score: 0.85
Test score: 0.78


In [500]:
print(classification_report(y_train,xgbtfidf1.predict(X_trainxgb)))
print(classification_report(y_test,xgbtfidf1.predict(X_testxgb)))

              precision    recall  f1-score   support

          -1       0.88      0.94      0.91      3634
           0       0.75      0.71      0.73      1241
           1       0.86      0.73      0.79       981

    accuracy                           0.85      5856
   macro avg       0.83      0.79      0.81      5856
weighted avg       0.85      0.85      0.85      5856

              precision    recall  f1-score   support

          -1       0.82      0.90      0.86       932
           0       0.63      0.50      0.56       295
           1       0.73      0.62      0.67       237

    accuracy                           0.78      1464
   macro avg       0.73      0.68      0.70      1464
weighted avg       0.77      0.78      0.77      1464



# Conclusion

I observed following key points:
1. Stemming gave better accuracy than lemmitization.
2. Keeping stop words helped in improving accuracy, since sentiment analysis is sensitive to stop words.
3. Count Vectorization was better than tfidf in this case. 

Finally out of all models, XGBoost prevented overfitting however it didn't improve F1 score.

Logistic Regression without removing stop words gave the best F1 macro score. Hence, I decided to use it for my final prediction.




# Final prediction

In [552]:
# predicting outcome for test data 
test1 = pd.read_csv("test.csv", encoding='ISO-8859-1')
test1['Target'] = gridSearchLR.predict(test['text'])
test1 = test1.drop(['text'], axis=1)
test1.to_csv('final_pred.csv', index=False)