In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
amazon = pd.read_csv('/Users/spencerfogelman/Downloads/sentiment labelled sentences/amazon_cells_labelled.txt', header=None, delimiter='\t').dropna()

In [4]:
amazon.columns = ['Message', 'Classification']

In [5]:
print(amazon.head())
print(amazon.shape)

                                             Message  Classification
0  So there is no way for me to plug it in here i...               0
1                        Good case, Excellent value.               1
2                             Great for the jawbone.               1
3  Tied to charger for conversations lasting more...               0
4                                  The mic is great.               1
(1000, 2)


In [6]:
amazon['Classification'].value_counts()

1    500
0    500
Name: Classification, dtype: int64

# Model 1

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
text = amazon["Message"]
vectorizer = CountVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(text).toarray()
print(X)
print(X.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1000, 7895)


In [8]:
y = amazon['Classification']

In [9]:
vectorizer.get_feature_names()[:50]

['10',
 '10 minutes',
 '10 of',
 '10 series',
 '100',
 '100 functional',
 '11',
 '11 months',
 '12',
 '12 minutes',
 '13',
 '13 bucks',
 '15',
 '15 seconds',
 '15g',
 '15g and',
 '18',
 '18 months',
 '20',
 '20 feet',
 '20 left',
 '2000',
 '2005',
 '2005 just',
 '2160',
 '2160 from',
 '24',
 '24 hours',
 '2mp',
 '2mp and',
 '325',
 '325 cellphone',
 '350',
 '350 headset',
 '375',
 '375 and',
 '3o',
 '3o minutes',
 '42',
 '42 usb',
 '44',
 '44 until',
 '45',
 '45 minutes',
 '4s',
 '4s despite',
 '50',
 '50 cent',
 '50 down',
 '5020']

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [11]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
print(bnb.score(X_train, y_train))
print(bnb.score(X_test, y_test))

0.98625
0.765


In [12]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(bnb, X, y, cv=10))
print(cross_val_score(bnb, X, y, cv=10).mean())

[0.84 0.83 0.86 0.79 0.81 0.74 0.77 0.78 0.8  0.78]
0.8


# Model 2

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_new = TfidfVectorizer()
X_new = vectorizer_new.fit_transform(text).toarray()
print(X.shape)

(1000, 7895)


In [14]:
bnb_new = BernoulliNB()

In [15]:
X_train,X_test, y_train,y_test = train_test_split(X_new, y, test_size=0.2, random_state=1)
bnb_new.fit(X_train, y_train)
bnb_new.score(X_train, y_train)

0.9675

In [16]:
bnb_new.score(X_test, y_test)

0.795

# Model 3

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
steps = [('vectorizer', TfidfVectorizer()),
        ('bnb', BernoulliNB())]
pipeline = Pipeline(steps)
parameters = {'vectorizer__ngram_range': [(1,1), (1,2), (1,3)],
             'vectorizer__stop_words': [None, 'english'],
              'vectorizer__norm': ['l1', 'l2'],
              'vectorizer__min_df': [0.01, 0.05, 0.1, 1],
              'vectorizer__max_df': [0.8, 0.9, 1.0]
             }
X = amazon['Message']
y = amazon['Classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
print(cv.score(X_train, y_train))
print(cv.score(X_test, y_test))


0.9875
0.81


In [18]:
#How to do tfidf on only subset of data?
#length of uppercase

In [19]:
print(cv.best_params_)

{'vectorizer__max_df': 0.8, 'vectorizer__norm': 'l1', 'vectorizer__min_df': 1, 'vectorizer__stop_words': 'english', 'vectorizer__ngram_range': (1, 3)}


In [20]:
amazon['Text_length'] = amazon['Message'].apply(lambda x: len(x))
print(amazon.head())

                                             Message  Classification  \
0  So there is no way for me to plug it in here i...               0   
1                        Good case, Excellent value.               1   
2                             Great for the jawbone.               1   
3  Tied to charger for conversations lasting more...               0   
4                                  The mic is great.               1   

   Text_length  
0           82  
1           27  
2           22  
3           79  
4           17  
