In [94]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from itertools import combinations

import nltk
import unicodedata
import string
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_lg")

%matplotlib inline

## Part 1

In [3]:
%%bash
%cd ~/galvanized/week10/naive-bayes-hw
python main.py

Our belief before starting the experiment is equally distributed {1: 0.3333333333333333, 2: 0.3333333333333333, 3: 0.3333333333333333}
--------------------------------------------------
selected coin is 1, we dont know this, we need to find through experiment and verify
--------------------------------------------------
flip a coin
flip: T
evidence of getting T based on our belief is: [0.2333333333333333, 0.18333333333333335, 0.08333333333333333]
 
updating our belief based on evidence {1: 0.4666666666666667, 2: 0.36666666666666675, 3: 0.16666666666666669}
--------------------------------------------------
flip a coin
flip: H
evidence of getting H based on our belief is: [0.13999999999999999, 0.16500000000000004, 0.125]
 
updating our belief based on evidence {1: 0.3255813953488371, 2: 0.38372093023255816, 3: 0.2906976744186046}
--------------------------------------------------
flip a coin
flip: T
evidence of getting T based on our belief is: [0.22790697674418597, 0.211046511627907, 0

bash: line 1: fg: no job control


## Part 2

- Independent assumption makes naive bayes classifier so naive

- **Types Naive Bayes Classifiers**

    - Bernoulli: Used when outcome is 0 / 1. underlying distribution is Bernoulli. 

    - Gaussian: Used when outcome is real number with normal distribution assumption. 

    - Multinomial: Used when you repeat an experiment N number of times and outcome of the experiment is more than 2 possible values. [counting more than 2 possible outcomes]

- we can still use naive bayes classifier if the features are independent. We can assume that they are independent even though they are not, it still works well

## Part 3

In [6]:
grad = pd.read_csv('grad.csv')
grad.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [85]:
y = grad.admit
X = grad[['gre', 'gpa', 'rank']]

# test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 42)

# standard scalar
scalar = MinMaxScaler().fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

# Logistic Regression
model1 = LogisticRegression(solver = 'lbfgs').fit(X_train, y_train)
print("Logistic Regression Score: ", accuracy_score(y_test, model1.predict(X_test)))

# Random Forest
model2 = RandomForestClassifier(n_estimators=4).fit(X_train, y_train)
print("Random Forest Score: ", accuracy_score(y_test, model2.predict(X_test)))

# Guassian Naive Bayes
model3 = GaussianNB().fit(X_train, y_train)
print('Guassian NB Score: ', accuracy_score(y_test, model3.predict(X_test)))

Logistic Regression Score:  0.725
Random Forest Score:  0.7
Guassian NB Score:  0.7083333333333334


  return self.partial_fit(X, y)


- Logistic regression performs well compared to other 2 models, depending on the run Random Forest can perform well but most of the time random forest performance is bad. 

## Part 4

In [143]:
data = pd.read_csv('~/galvanized/week9/twitter_sentiment/twitter-airline-sentiment/Tweets.csv')
text = data.pop('text')
label = data.pop('airline_sentiment')

In [144]:
len(text), len(label)

(14640, 14640)

In [148]:
type(text[0])

str

In [170]:
def myfunc(n):
    n = n.lower()
    return unicodedata.normalize('NFKD', n).encode('ASCII', 'ignore').decode('utf-8')

# normalize string after converting it into lower case using above function. 
normalize = list(map(myfunc, text))

# Remove stop words
sw = stopwords.words('english')
pt = string.punctuation
filtered = list(filter(lambda token: token not in sw and token not in pt, normalize))

# Stemming
stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = list(map(stemmer_snowball.stem, filtered))

In [171]:
# Create bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tokens_stemsnowball)
#print(vectorizer.get_feature_names())

In [172]:
# test train split
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.30, random_state = 42)

### bag of words model accuracy

In [173]:
model3 = MultinomialNB().fit(X_train, y_train)
print('Multinomial NB Score: ', accuracy_score(y_test, model3.predict(X_test)))

Multinomial NB Score:  0.7745901639344263


### Knn clustering model accuracy

In [174]:
knn = KNeighborsClassifier(n_neighbors=7, algorithm='brute', metric = 'cosine')
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.7169854280510018

**Naive Bayes model performed better than Kmeans Model**