# Modeling with Text Only to Predict Sentiment

In [1]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## week 3 imports
import missingno as msno     # msno.bar(titanic);  or msno.matrix(titanic);
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Linear and general modeling imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Feature Engineering
from sklearn.impute import SimpleImputer   # Imputation 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures   # Scale/transform/feature engineering

import patsy
# y, X = patsy.dmatrices(formula, data=diamonds, return_type='dataframe')

# GridSearch and Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

# Logistic and Classification metrics
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score, classification_report

# K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# naive bayes imports
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# SVMs
from sklearn.svm import LinearSVC, SVC

# Decision Trees
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

# Import Bagging, Boosting, and Random Forests, and ExtraTrees (Extremely Randomized Trees)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor

# NLP imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.download()  --> Download all, and then restart jupyter lab
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, pos_tag
import re

import json

In [2]:
df = pd.read_csv('../../data/clean_tweets.csv')
df.head()

Unnamed: 0,airline_sentiment,retweet_count,text,tweet_created,tweet_day,tweet_month,tweet_hour,airline_Delta,airline_Southwest,airline_US Airways,airline_United,airline_Virgin America,clean_text_stem,sentiment_score,clean_word_count
0,neutral,0,I didn't today... Must mean I need to take an...,2015-02-24 11:15:48-08:00,1,2,11,0,0,0,0,1,today must mean need take anoth trip,0.0,7
1,negative,0,"it's really aggressive to blast obnoxious ""en...",2015-02-24 11:15:36-08:00,1,2,11,0,0,0,0,1,realli aggress blast obnoxi entertain guest fa...,-0.058824,10
2,negative,0,seriously would pay $30 a flight for seats th...,2015-02-24 11:14:33-08:00,1,2,11,0,0,0,0,1,serious would pay 30 flight seat play realli b...,-0.041667,12
3,positive,0,"yes, nearly every time I fly VX this “ear wor...",2015-02-24 11:13:57-08:00,1,2,11,0,0,0,0,1,ye nearli everi time fli vx ear worm go away,0.0,10
4,neutral,0,Really missed a prime opportunity for Men Wit...,2015-02-24 11:12:29-08:00,1,2,11,0,0,0,0,1,realli miss prime opportun men without hat par...,0.0,11


In [3]:
# Finds the baseline accuracy
df['airline_sentiment'].value_counts(normalize=True)

negative    0.710114
neutral     0.165570
positive    0.124316
Name: airline_sentiment, dtype: float64

In [4]:
# Sets up X and y
X = df['clean_text_stem']
y = df['airline_sentiment']

In [5]:
# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    random_state=42)

In [8]:
# Instantiates a pipeline
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [9]:
# Creates the pipe parameters 
pipe_params = {
    'cvec__max_features': [4000, 5000],
    'cvec__min_df':[1, 2],
    'cvec__max_df':[0.90, 0.98],
    'cvec__ngram_range':[(1,1), (1,2)],
    'bag__n_estimators': [10, 50, 100]
}

In [10]:
# Instantiates the GridSearchCV
gs = GridSearchCV(pipe, 
                param_grid=pipe_params,
                cv=5, 
                verbose=1,
                n_jobs = 4)

In [11]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min


KeyboardInterrupt: 

In [21]:
train1, test1 = gs.score(X_train, y_train), gs.score(X_test, y_test)
train1, test1

(0.7165636588380717, 0.7137560252131998)

In [22]:
gs.best_params_

{'cvec__max_df': 0.98,
 'cvec__max_features': 2000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'rf__max_depth': 10,
 'rf__n_estimators': 100}

---
### Second Gridsearch

In [26]:
# Instantiates a pipeline
pipe2 = Pipeline([
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression())
])

In [17]:
# Creates the pipe parameters 
pipe_params2 = {
    'cvec__max_features': [5000, 6000, 7000],
    'cvec__min_df':[1],
    'cvec__max_df':[0.70, 0.80, 0.90],
    'cvec__ngram_range':[(1,2)],
    'logreg__C': [0.1, ]
}

In [18]:
# Instantiates the GridSearchCV
gs = GridSearchCV(pipe, 
                param_grid=pipe_params,
                cv=5, 
                verbose=1,
                n_jobs = 4)

In [19]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   35.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed:  1.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('logreg', LogisticRegression())]),
             n_jobs=4,
             param_grid={'cvec__max_df': [0.9, 0.98],
                         'cvec__max_features': [2000, 3000, 4000, 5000],
                         'cvec__min_df': [1, 2],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'logreg__C': [1, 0.1, 0.01]},
             verbose=1)