In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.naive_bayes import BernoulliNB

#### Drill: Regression or Classification

For each of the following situations, decide if you would model using a regression or classification model. Discuss your reasoning with your mentor next time you meet.

    The amount a person will spend on a given site in the next 24 months.
        
        regression

    What color car someone is going to buy.
    
        classification

    How many children a family will have.
    
        classification

    If someone will sign up for a service.
    
        classification

    The number of times someone will get sick in a year.
    
        regression

    The probability someone will get sick in the next month.
    
        regression

    Which medicine will work best for a given patient.
    
        classification


In [21]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

In [22]:
sms_raw

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [23]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [24]:
sms_raw['allcaps'] = sms_raw.message.str.isupper()

In [25]:
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [26]:
sns.heatmap(sms_raw.corr())

<matplotlib.axes._subplots.AxesSubplot at 0x2db685b3400>

In [27]:
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

In [38]:
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,allcaps
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False


In [28]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 5572 points : 604


#### Challenge: Feedback Analysis

In [12]:
amazon_path = "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/amazon_cells_labelled.txt"
imdb_path ="https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/imdb_labelled.txt"
yelp_path ="https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/yelp_labelled.txt"

In [15]:
keywords = ['good', 'great', 'best', 'delicious', 'oustanding', 'bad', 'terrible', 'worst', 'never', 'broken', 'boring', 'deplorable']

#### Amazon

In [None]:
amazon = pd.read_csv(amazon_path, delimiter= '\t', header=None)
amazon.columns = ["Message","Sentiment"]
for key in keywords:
    amazon[str(key)] = amazon.Message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [25]:
data_amazon = amazon[keywords]
target_amazon = amazon['Sentiment']

In [26]:
bnb = BernoulliNB()

bnb.fit(data_amazon, target_amazon)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data_amazon)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data_amazon.shape[0],
    (target_amazon != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 429


#### IMDB

In [None]:
imdb = pd.read_csv(imdb_path, delimiter='\t', header=None)
imdb.columns = ["Message","Sentiment"]
for key in keywords:
    imdb[str(key)] = imdb.Message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [27]:
data_imdb = imdb[keywords]
target_imdb = imdb['Sentiment']

In [28]:
bnb = BernoulliNB()

bnb.fit(data_imdb, target_imdb)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data_imdb)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data_imdb.shape[0],
    (target_imdb != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 323


#### Yelp

In [13]:
yelp = pd.read_csv(yelp_path, delimiter='\t', header=None)
yelp.columns = ["Message","Sentiment"]
for key in keywords:
    yelp[str(key)] = yelp.Message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [29]:
data_yelp = yelp[keywords]
target_yelp = yelp['Sentiment']

In [30]:
bnb = BernoulliNB()

bnb.fit(data_yelp, target_yelp)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data_yelp)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data_yelp.shape[0],
    (target_yelp != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 418


For the feature engineering I choose words that were either strongly positive or negative I felt. Some are more specific like, delicious which is only really pertaining to food.  