# Product Sentiment Classifier

## Data Cleaning and EDA
Import necessary libraries

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.collocations import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
#from sklearn.metrics import (classification_report, 
                             #plot_confusion_matrix)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import string
%matplotlib inline

%run -i "clean_lemmatize_token.py"
%run -i "report.py"

XGBoostError: XGBoost Library (xgboost.dll) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libgomp.so for UNIX-like OSes)
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['[WinError 127] The specified procedure could not be found']


Import data from data.world.

In [None]:
# Read the tweet dataset directly from data.world; the bytes are decoded as ISO-8859-1.
DATA_URL = 'https://query.data.world/s/zbehvjkmiewbkln44rae6iphum4v3g'
df = pd.read_csv(DATA_URL, encoding="ISO-8859-1")
df.head()

Rename columns

In [None]:
# Replace the two verbose source column names with short ones; other columns are untouched.
column_renames = {
    'emotion_in_tweet_is_directed_at': 'brand_product',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
}
df = df.rename(columns=column_renames)
df.head()

Explored data set with info() method. 1 NaN value present in tweet_text column and ~6,000 in brand_product column. Will need to address prior to modeling

In [None]:
# Print column dtypes and non-null counts so missing values can be located.
df.info()

Remove NaN tweet_text from DataFrame

In [None]:
# Display the rows whose tweet text is missing.
missing_text_mask = df['tweet_text'].isna()
df.loc[missing_text_mask]

In [None]:
# Remove every row without tweet text. Filtering on the NaN condition instead of the
# hard-coded row label 6 keeps this cell idempotent (re-running is a no-op rather than
# a KeyError) and independent of the row order of the downloaded file.
# .copy() gives later cells an independent frame to assign columns into.
df = df.dropna(subset=['tweet_text']).copy()

### Exploration
Explored the sentiment category. Most tweets are marked as having no emotion, which will not help the initial binary classification model. Most of the data set will only be usable once the model is built to take neutral sentiment into account.

In [None]:
# Non-null count of every column within each sentiment label.
df.groupby('sentiment').count()

In [None]:
# Display the tweets labelled "I can't tell".
cant_tell_mask = df['sentiment'].eq("I can't tell")
df.loc[cant_tell_mask]

In [None]:
# Number of tweets per sentiment label, most frequent first.
df['sentiment'].value_counts()

Dummy-encoded the sentiment column to help with visualizations that compare sentiment across brands

In [None]:
# One indicator column per distinct sentiment label.
sentiment_labels = df['sentiment']
df1 = pd.get_dummies(sentiment_labels)
df1.head()

In [None]:
# Attach the indicator columns (aligned on the index), then drop the categorical
# column they were derived from.
df_with_indicators = df.join(df1)
df_dummied = df_with_indicators.drop(columns='sentiment')
df_dummied.head()

Made visualization to explore distribution of sentiment across brand/product. Will combine Apple and Google products to further explore distribution. Sentiment is overwhelmingly positive across all products and most sentiment data is logged for Apple products

In [None]:
# Sum the columns within each brand/product group and draw the totals as horizontal bars.
brand_totals = df_dummied.groupby('brand_product').sum()
brand_totals.plot(kind='barh', figsize=(10,7))
plt.title('Sentiment Analysis by Brand/Product', size=20)
plt.xlabel('# of Instances', size=15)
plt.ylabel('Brand/Product', size=15)
plt.show()

Made below visualization to explore the missing brand_product classifications for each sentiment

In [None]:
# Non-null counts of each column per sentiment label, drawn as horizontal bars;
# differing bar lengths within a label reveal columns with missing values.
counts_per_sentiment = df.groupby('sentiment').count()
counts_per_sentiment.plot(kind='barh', figsize=(10,5))
plt.title('Sentiment Distribution', size=20)
plt.xlabel('# of Instances',size=15)
plt.ylabel('Sentiment', size=15)
plt.show()

Usable data (Positive or Negative sentiment) for baseline first model is 39.02%. Will be necessary to eventually build a multiclass classifier with No Emotion classification

In [None]:
# Percentage (two decimals) of rows labelled either positive or negative.
is_positive = df['sentiment'] == 'Positive emotion'
is_negative = df['sentiment'] == 'Negative emotion'
polar_rows = df.loc[is_positive | is_negative]
usable_data = round(len(polar_rows) / len(df) * 100, 2)
print('Percentage of Data with either Positive or Negative Sentiment: {}%'.format(usable_data))

Combined Apple and Google product names together to explore further visualizations

In [None]:
# Collapse the individual product labels into one label per company.
apple_labels = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone',
                'Other Apple product or service']
google_labels = ['Google', 'Other Google product or service',
                 'Android App', 'Android']
brand_column = df_dummied['brand_product']
brand_column = brand_column.replace(to_replace=apple_labels, value='apple_product')
brand_column = brand_column.replace(google_labels, 'android_product')
df_dummied['brand_product'] = brand_column
df_dummied['brand_product'].value_counts()

Combined Brand_product columns to show difference in amount of information for each product

In [None]:
# Same chart as before, now with one bar group per company.
combined_totals = df_dummied.groupby('brand_product').sum()
combined_totals.plot(kind='barh', figsize=(10,4))
plt.title('Sentiment Analysis by Brand/Product Combined', size=20)
plt.xlabel('# of Instances', size=15)
plt.ylabel('Brand/Product', size=15)
# groupby sorts its keys, so position 0 is 'android_product' and position 1 is 'apple_product'.
plt.yticks(np.arange(2),['Google Product','Apple Product'])
plt.show()

Made a copy of the data frame to further explore EDA options

In [None]:
# Take a real copy. Plain assignment (`df_eda = df`) only binds a second name to the same
# DataFrame, so the EDA steps below (tokenising tweet_text into lists, merging brand
# labels) would also rewrite `df`, which the modeling section later preprocesses from raw text.
df_eda = df.copy()
df_eda.head()

Used an alternative version of our clean_lemmatize_token function to preprocess data for additional EDA

In [None]:
def clean_lemmatize_token_alt(tweet):
    """Turn a raw tweet into a list of lemmatized tokens for EDA.

    Steps, in order: remove ``string.punctuation`` characters and lowercase,
    tokenize with NLTK's ``word_tokenize``, drop English stop words, lemmatize
    each remaining token with ``WordNetLemmatizer``, and finally drop the
    tokens 'rt', 'mention', 'sxsw' and 'link'.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        list: The surviving lemmatized tokens, in their original order.
    """
    english_stops = set(stopwords.words('english'))
    without_punct = tweet.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(without_punct.lower())
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(tok) for tok in tokens if tok not in english_stops]
    noise_tokens = ['rt','mention','sxsw','link']
    return [lemma for lemma in lemmas if lemma not in noise_tokens]

In [None]:
# Replace each tweet string with its list of cleaned, lemmatized tokens.
tokenized_tweets = df_eda['tweet_text'].map(clean_lemmatize_token_alt)
df_eda['tweet_text'] = tokenized_tweets
df_eda.head()

In [None]:
# Number of tweets per brand/product label (rows with a missing label are not counted).
df_eda['brand_product'].value_counts()

In [None]:
# Build the four sentiment x platform subsets used for the mobile bigram charts.
#
# Two fixes over the previous version:
#   * `&` binds tighter than `|`, so `sent & a | b | c` applied the sentiment filter to the
#     first brand only and let every iPhone / app / Android row through regardless of
#     sentiment. Membership is now tested with isin() and combined with one explicit `&`.
#   * The Google label was spelled 'Other Google Product or Service', which never matches
#     the spelling 'Other Google product or service' used elsewhere in this notebook.
apple_mobile_labels = ['iPad', 'iPhone', 'iPad or iPhone App']
android_mobile_labels = ['Other Google product or service', 'Android App', 'Android']

is_positive = df_eda['sentiment'] == 'Positive emotion'
is_negative = df_eda['sentiment'] == 'Negative emotion'
is_apple_mobile = df_eda['brand_product'].isin(apple_mobile_labels)
is_android_mobile = df_eda['brand_product'].isin(android_mobile_labels)

df_pos_apple_mobile = df_eda.loc[is_positive & is_apple_mobile]
df_neg_apple_mobile = df_eda.loc[is_negative & is_apple_mobile]
df_pos_android_mobile = df_eda.loc[is_positive & is_android_mobile]
df_neg_android_mobile = df_eda.loc[is_negative & is_android_mobile]

In [None]:
# Pull the tokenised tweet column out of each mobile subset.
(pos_apple_mobile_tweet_list,
 neg_apple_mobile_tweet_list,
 pos_android_mobile_list,
 neg_android_mobile_list) = (
    subset['tweet_text']
    for subset in (df_pos_apple_mobile, df_neg_apple_mobile,
                   df_pos_android_mobile, df_neg_android_mobile)
)

In [None]:
# Flatten each series of token lists into one long token list, preserving order.
pos_apple_mobile_concat = [tok for tweet in pos_apple_mobile_tweet_list for tok in tweet]
neg_apple_mobile_concat = [tok for tweet in neg_apple_mobile_tweet_list for tok in tweet]
pos_android_mobile_concat = [tok for tweet in pos_android_mobile_list for tok in tweet]
neg_android_mobile_concat = [tok for tweet in neg_android_mobile_list for tok in tweet]

In [None]:
# Total number of tokens collected for the positive Apple-mobile subset.
len(pos_apple_mobile_concat)

In [None]:
# Order matters: the plotting cell reads these by position (0-3).
mobile_concat_list = [
    pos_apple_mobile_concat,
    neg_apple_mobile_concat,
    pos_android_mobile_concat,
    neg_android_mobile_concat,
]

In [None]:
# Score every bigram of each token list by raw frequency; one scored list per subset,
# in the same order as mobile_concat_list.
bigram_measures = nltk.collocations.BigramAssocMeasures()
mobile_tweets_scored_list = [
    BigramCollocationFinder.from_words(tokens).score_ngrams(bigram_measures.raw_freq)
    for tokens in mobile_concat_list
]

In [None]:
# 2x2 grid of the ten highest-scoring bigrams for each mobile subset.
mobile_titles = ['Positive Apple Mobile Tweets Bigrams',
                 'Negative Apple Mobile Tweets Bigrams',
                 'Positive Android Mobile Tweets Bigrams',
                 'Negative Android Mobile Tweets Bigrams']
df_mobile_graph1, df_mobile_graph2, df_mobile_graph3, df_mobile_graph4 = (
    pd.DataFrame(scored[:10], columns = ['Bigram','Raw_Frequency_Score'])
    for scored in mobile_tweets_scored_list[0:4]
)
mobile_frames = [df_mobile_graph1, df_mobile_graph2, df_mobile_graph3, df_mobile_graph4]

fig, axs = plt.subplots(2,2)
fig.set_figheight(17)
fig.set_figwidth(23)
# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, title, top_bigrams in zip(axs.flat, mobile_titles, mobile_frames):
    panel.title.set_text(title)
    sns.barplot(data=top_bigrams, y='Bigram',x='Raw_Frequency_Score', orient='h', ax = panel)
plt.show()

Combined all Apple products and all Google products together

In [None]:
# Collapse the individual product labels of the EDA frame into one label per company.
apple_labels = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone',
                'Other Apple product or service']
google_labels = ['Google', 'Other Google product or service',
                 'Android App', 'Android']
eda_brand_column = df_eda['brand_product']
eda_brand_column = eda_brand_column.replace(to_replace=apple_labels, value='apple_product')
eda_brand_column = eda_brand_column.replace(google_labels, 'android_product')
df_eda['brand_product'] = eda_brand_column
df_eda['brand_product'].value_counts()

Created new data frames based on different brands and sentiments to plot bigrams of most common word pairs

In [None]:
# Boolean masks for each sentiment and each company, then every combination of the two.
is_pos = df_eda['sentiment'] == 'Positive emotion'
is_neg = df_eda['sentiment'] == 'Negative emotion'
is_none = df_eda['sentiment'] == 'No emotion toward brand or product'
is_apple = df_eda['brand_product'] == 'apple_product'
is_google = df_eda['brand_product'] == 'android_product'

df_pos = df_eda.loc[is_pos]
df_neg = df_eda.loc[is_neg]
df_none = df_eda.loc[is_none]

df_apple = df_eda.loc[is_apple]
df_google = df_eda.loc[is_google]

df_pos_apple = df_eda.loc[is_apple & is_pos]
df_neg_apple = df_eda.loc[is_apple & is_neg]
df_none_apple = df_eda.loc[is_apple & is_none]

df_pos_google = df_eda.loc[is_google & is_pos]
df_neg_google = df_eda.loc[is_google & is_neg]
df_none_google = df_eda.loc[is_google & is_none]

In [None]:
# Pull the tokenised tweet column out of every subset frame.
tweet_list = df_eda['tweet_text']
pos_tweet_list, neg_tweet_list, none_tweet_list = (
    subset['tweet_text'] for subset in (df_pos, df_neg, df_none)
)
(apple_tweet_list, apple_pos_tweet_list,
 apple_neg_tweet_list, apple_none_tweet_list) = (
    subset['tweet_text']
    for subset in (df_apple, df_pos_apple, df_neg_apple, df_none_apple)
)
(google_tweet_list, google_pos_tweet_list,
 google_neg_tweet_list, google_none_tweet_list) = (
    subset['tweet_text']
    for subset in (df_google, df_pos_google, df_neg_google, df_none_google)
)

In [None]:
def _flatten_tokens(token_lists):
    """Concatenate an iterable of token lists into one flat list, preserving order."""
    return [token for tokens in token_lists for token in tokens]


# All tweets, then split by sentiment.
tweet_concat = _flatten_tokens(tweet_list)
pos_tweet_concat = _flatten_tokens(pos_tweet_list)
neg_tweet_concat = _flatten_tokens(neg_tweet_list)
none_tweet_concat = _flatten_tokens(none_tweet_list)

# Apple tweets, overall and by sentiment.
apple_tweet_concat = _flatten_tokens(apple_tweet_list)
apple_pos_tweet_concat = _flatten_tokens(apple_pos_tweet_list)
apple_neg_tweet_concat = _flatten_tokens(apple_neg_tweet_list)
apple_none_tweet_concat = _flatten_tokens(apple_none_tweet_list)

# Google tweets, overall and by sentiment.
google_tweet_concat = _flatten_tokens(google_tweet_list)
google_pos_tweet_concat = _flatten_tokens(google_pos_tweet_list)
google_neg_tweet_concat = _flatten_tokens(google_neg_tweet_list)
google_none_tweet_concat = _flatten_tokens(google_none_tweet_list)

In [None]:
# Order matters: the plotting cells read the scored results by position (0-11).
concat_list = [
    # all brands
    tweet_concat, pos_tweet_concat, neg_tweet_concat, none_tweet_concat,
    # Apple
    apple_tweet_concat, apple_pos_tweet_concat, apple_neg_tweet_concat, apple_none_tweet_concat,
    # Google
    google_tweet_concat, google_pos_tweet_concat, google_neg_tweet_concat, google_none_tweet_concat,
]

Used a for loop to collect Raw Frequency Scores for each subset of the data

In [None]:
# Score every bigram of each token list by raw frequency; one scored list per subset,
# in the same order as concat_list.
# The loop variable used to be called `list`, which rebinds the builtin for the rest of
# the kernel session (any later `list(...)` call would raise TypeError).
# The measures object is the same for every subset, so it is created once.
bigram_measures = nltk.collocations.BigramAssocMeasures()
tweets_scored_list = []
for token_list in concat_list:
    tweet_finder = BigramCollocationFinder.from_words(token_list)
    tweets_scored = tweet_finder.score_ngrams(bigram_measures.raw_freq)
    tweets_scored_list.append(tweets_scored)

Plotted the All Tweets Bigrams

In [None]:
# 2x2 grid of the ten highest-scoring bigrams: all tweets, positive, negative, no emotion
# (positions 0-3 of tweets_scored_list).
overall_titles = ['All Tweets Bigrams',
                  'All Positive Tweets Bigrams',
                  'All Negative Tweets Bigrams',
                  'All No Emotion Tweets Bigrams']
df_graph1, df_graph2, df_graph3, df_graph4 = (
    pd.DataFrame(scored[:10], columns = ['Bigram','Raw_Frequency_Score'])
    for scored in tweets_scored_list[0:4]
)
overall_frames = [df_graph1, df_graph2, df_graph3, df_graph4]

fig, axs = plt.subplots(2,2)
fig.set_figheight(17)
fig.set_figwidth(23)
# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, title, top_bigrams in zip(axs.flat, overall_titles, overall_frames):
    panel.title.set_text(title)
    sns.barplot(data=top_bigrams, y='Bigram',x='Raw_Frequency_Score', orient='h', ax = panel)
plt.show()

Plotted the Apple Tweets Bigrams

In [None]:
# 2x2 grid of the ten highest-scoring bigrams for Apple tweets: overall, positive,
# negative, no emotion (positions 4-7 of tweets_scored_list).
apple_titles = ['All Apple Tweets Bigrams',
                'All Positive Apple Tweets Bigrams',
                'All Negative Apple Tweets Bigrams',
                'All No Emotion Apple Tweets Bigrams']
df_graph5, df_graph6, df_graph7, df_graph8 = (
    pd.DataFrame(scored[:10], columns = ['Bigram','Raw_Frequency_Score'])
    for scored in tweets_scored_list[4:8]
)
apple_frames = [df_graph5, df_graph6, df_graph7, df_graph8]

fig, axs = plt.subplots(2,2)
fig.set_figheight(17)
fig.set_figwidth(23)
# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, title, top_bigrams in zip(axs.flat, apple_titles, apple_frames):
    panel.title.set_text(title)
    sns.barplot(data=top_bigrams, y='Bigram',x='Raw_Frequency_Score', orient='h', ax = panel)
plt.show()

Plotted the Google Tweets Bigrams

In [None]:
# 2x2 grid of the ten highest-scoring bigrams for Google tweets: overall, positive,
# negative, no emotion (positions 8-11 of tweets_scored_list).
google_titles = ['All Google Tweets Bigrams',
                 'All Positive Google Tweets Bigrams',
                 'All Negative Google Tweets Bigrams',
                 'All No Emotion Google Tweets Bigrams']
df_graph9, df_graph10, df_graph11, df_graph12 = (
    pd.DataFrame(scored[:10], columns = ['Bigram','Raw_Frequency_Score'])
    for scored in tweets_scored_list[8:12]
)
google_frames = [df_graph9, df_graph10, df_graph11, df_graph12]

fig, axs = plt.subplots(2,2)
fig.set_figheight(17)
fig.set_figwidth(23)
# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, title, top_bigrams in zip(axs.flat, google_titles, google_frames):
    panel.title.set_text(title)
    sns.barplot(data=top_bigrams, y='Bigram',x='Raw_Frequency_Score', orient='h', ax = panel)
plt.show()

### Preprocessing Tweets
Clean, lemmatize, and format data for vectorization and modeling

In [None]:
# Apply the cleaning helper to every tweet and overwrite the column with the result.
# `clean_lemmatize_token` is not defined in this notebook; it is presumably provided by
# clean_lemmatize_token.py, which the first cell loads with %run -i — TODO confirm.
# NOTE(review): because the column is overwritten, re-running this cell feeds already
# processed values back into the helper.
df['tweet_text'] = df['tweet_text'].map(clean_lemmatize_token)

### binary classification model
Split the DataFrame to take only binary options 

In [None]:
# Keep only the rows labelled negative or positive.
polar_labels = ['Negative emotion', 'Positive emotion']
binary = df[df['sentiment'].isin(polar_labels)]

vectorize the data

In [None]:
# Bag-of-words features for the binary subset, then an 80/20 train/test split.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(binary['tweet_text'])
Y = binary['sentiment']
# random_state pins the shuffle so the scores reported below are reproducible across
# kernel restarts (same seed as the SVM section's split).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

### Logistic Regression

In [None]:
# Class-weighted logistic regression baseline.
# `multi_class='auto'` was dropped: 'auto' is the default and the parameter is deprecated
# in recent scikit-learn releases, so passing it only produces a FutureWarning.
logreg = LogisticRegression(solver='lbfgs',
                            max_iter=400,
                            class_weight='balanced')
# `report` is not defined in this notebook; presumably provided by report.py, loaded
# with %run -i in the first cell — TODO confirm what it fits and prints.
report(logreg)

In [None]:
# Random forest baseline. The forest is randomised (bootstrap samples, feature subsets),
# so the seed is fixed to make the reported scores reproducible.
rand_tree = RandomForestClassifier(random_state=42)
report(rand_tree)

# Binary Support Vector Machine Modeling

In [None]:
# Keep the positive/negative rows and encode the label as 1/0.
# .copy() makes df_modeling an independent frame: assigning a column into a bare .loc
# slice is chained assignment (SettingWithCopyWarning) and may or may not write through.
polar_mask = df_eda['sentiment'].isin(['Positive emotion', 'Negative emotion'])
df_modeling = df_eda.loc[polar_mask].copy()
# map() yields an integer column directly; replace() from str to int relies on implicit
# downcasting that pandas has deprecated.
df_modeling['sentiment'] = df_modeling['sentiment'].map({'Positive emotion': 1,
                                                         'Negative emotion': 0})

In [None]:
# Re-join each token list into one space-separated string for the vectoriser.
# Values that are already strings are left alone so the cell is safe to re-run:
# ' '.join() on a string would otherwise insert a space between every character.
df_modeling['tweet_text'] = df_modeling['tweet_text'].map(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
df_modeling.head()

In [None]:
# TF-IDF features, capped at a 400-term vocabulary.
TFIDF_MAX_FEATURES = 400
tf_idfvectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

In [None]:
# Features are the joined tweet strings; the target is the 1/0 sentiment label.
X, y = df_modeling['tweet_text'], df_modeling['sentiment']

In [None]:
# 75/25 split with a fixed seed so every model below sees the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Learn the vocabulary and IDF weights from the training tweets only, then apply the
# fitted transform to both splits. Fitting on the full X let the test tweets influence
# the selected terms and their IDF weights, which leaks test information into the features.
X_train_tfidf = tf_idfvectorizer.fit_transform(X_train)
X_test_tfidf = tf_idfvectorizer.transform(X_test)

In [None]:
# Baseline linear SVM with balanced class weights; report test metrics and both accuracies.
svm_clf = SVC(kernel='linear', class_weight='balanced')
svm_clf.fit(X_train_tfidf, y_train)

y_test_preds = svm_clf.predict(X_test_tfidf)
y_train_preds = svm_clf.predict(X_train_tfidf)

test_accuracy_pct = accuracy_score(y_test, y_test_preds)*100
train_accuracy_pct = accuracy_score(y_train, y_train_preds)*100

print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
print("Test SVM Accuracy Score -> ",test_accuracy_pct)
print("Training SVM Accuracy Score -> ",train_accuracy_pct)

In [None]:
# Grid search over the regularisation strength of the linear SVM.
# `gamma` was removed from the grid: it is the kernel coefficient for the rbf/poly/sigmoid
# kernels and is ignored when kernel='linear', so searching four gamma values only
# repeated each C fit four times (160 fits instead of 40) with identical scores.
params_svc = {'C': [0.1, 1, 10, 100]}
estimator_svc = SVC(kernel='linear',
                    class_weight='balanced')
grid_search_svc = GridSearchCV(estimator=estimator_svc,
                               param_grid=params_svc,
                               scoring='accuracy',
                               n_jobs=-1,
                               cv=10,
                               verbose=True)

In [None]:
# Run the cross-validated search on the training split only.
grid_search_svc.fit(X_train_tfidf, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search_svc.best_params_

In [None]:
# Refit the linear SVM with the tuned C and report the same metrics as the baseline.
# `gamma` is no longer passed: the linear kernel ignores it, so listing it suggested a
# tuning effect that does not exist. Predictions are unchanged.
svm_clf = SVC(kernel='linear', class_weight='balanced', C=10)
svm_clf.fit(X_train_tfidf, y_train)
y_test_preds = svm_clf.predict(X_test_tfidf)
y_train_preds = svm_clf.predict(X_train_tfidf)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
print("Test SVM Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)
print("Training SVM Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)

# Binary XGBoost Model

In [None]:
# Baseline XGBoost classifier with default hyperparameters.
xgbc_clf = XGBClassifier()
xgbc_clf.fit(X_train_tfidf, y_train)
y_test_preds = xgbc_clf.predict(X_test_tfidf)
y_train_preds = xgbc_clf.predict(X_train_tfidf)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
# The labels previously said "SVM" (copied from the SVM section), mislabelling these scores.
print("Test XGBoost Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)
print("Training XGBoost Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)

In [None]:
# Sweep scale_pos_weight from 0.01 upward in 0.01 steps and record, for each value, the
# train accuracy, the test accuracy and their gap (all in percent).
# NOTE(review): candidates are scored on the held-out test split, so a weight chosen
# from this table is tuned to the test data; a validation split or cross-validation
# would keep the final test score unbiased.
train_test_difference = []
train_score = []
test_score = []
for pos_weight in np.arange(0.01, 1.01, 0.01):
    xgbc_clf = XGBClassifier(scale_pos_weight=pos_weight)
    xgbc_clf.fit(X_train_tfidf, y_train)
    y_test_preds = xgbc_clf.predict(X_test_tfidf)
    y_train_preds = xgbc_clf.predict(X_train_tfidf)
    # Each accuracy is computed once and reused (it used to be computed twice per step).
    train_accuracy_pct = accuracy_score(y_train, y_train_preds) * 100
    test_accuracy_pct = accuracy_score(y_test, y_test_preds) * 100
    train_test_difference.append(train_accuracy_pct - test_accuracy_pct)
    train_score.append(train_accuracy_pct)
    test_score.append(test_accuracy_pct)

In [None]:
# One row per swept weight; sorted so the smallest train/test gap comes first,
# with ties ordered by test score.
df_results = pd.DataFrame({
    'pos_scale_weight': np.arange(0.01, 1.01, 0.01),
    'train_test_difference': train_test_difference,
    'train_score': train_score,
    'test_score': test_score,
})
df_results.sort_values(['train_test_difference','test_score'])

In [None]:
# XGBoost with the scale_pos_weight picked from the sweep table.
xgbc_clf = XGBClassifier(scale_pos_weight=.96)
xgbc_clf.fit(X_train_tfidf, y_train)
y_test_preds = xgbc_clf.predict(X_test_tfidf)
y_train_preds = xgbc_clf.predict(X_train_tfidf)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
# The labels previously said "SVM" (copied from the SVM section), mislabelling these scores.
print("Test XGBoost Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)
print("Training XGBoost Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)

In [None]:
# Grid search over tree depth, number of trees and learning rate, scored by ROC AUC.
params_xgboost = {'max_depth': range(2,10,1),
                  'n_estimators': range(60,220,40),
                  'learning_rate': [0.1, 0.01, 0.05]}
estimator_xgboost = XGBClassifier(
    objective='binary:logistic',
    n_jobs=-1,
    seed=42,
    # Was misspelled `pos_scale_weight`, which is not an XGBoost parameter, so the class
    # weighting chosen in the sweep was never applied during the search.
    scale_pos_weight=.96
)
grid_search_xgboost = GridSearchCV(
    estimator=estimator_xgboost,
    param_grid=params_xgboost,
    scoring='roc_auc',
    n_jobs=-1,
    cv=10,
    verbose=True)

In [None]:
# Run the cross-validated search on the training split only.
grid_search_xgboost.fit(X_train_tfidf, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search_xgboost.best_params_

In [None]:
# Refit XGBoost with the tuned hyperparameters and report the same metrics as before.
xgbc_clf = XGBClassifier(scale_pos_weight=.96, learning_rate = 0.1, max_depth = 3, n_estimators = 180)
xgbc_clf.fit(X_train_tfidf, y_train)
y_test_preds = xgbc_clf.predict(X_test_tfidf)
y_train_preds = xgbc_clf.predict(X_train_tfidf)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
# The labels previously said "SVM" (copied from the SVM section), mislabelling these scores.
print("Test XGBoost Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)
print("Training XGBoost Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)