## Naive Bayes

In [5]:
# Read the three labelled sentiment files and pool their raw lines into one list.
dataset_paths = (
    "./sentiment-analysis-datasets/imdb_labelled.txt",
    "./sentiment-analysis-datasets/yelp_labelled.txt",
    "./sentiment-analysis-datasets/amazon_cells_labelled.txt",
)
lines = []
for dataset_path in dataset_paths:
    with open(dataset_path, "r") as text_file:
        # Split on '\n' only; empty/malformed entries are filtered in a later cell.
        lines += text_file.read().split('\n')

In [9]:
# Keep only well-formed "<text>\t<label>" records: exactly two tab-separated
# fields with a non-empty label. Each line is split once and the result reused
# (the previous version split every line up to three times).
lines = [
    fields
    for fields in (line.split("\t") for line in lines)
    if len(fields) == 2 and fields[1] != ''
]

In [10]:
# Preview the first five [text, label] pairs produced by the tab split.
lines[:5]

[['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
  '0'],
 ['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
  '0'],
 ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
  '0'],
 ['Very little music or anything to speak of.  ', '0'],
 ['The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
  '1']]

In [11]:
# Separate each [text, label] record into parallel lists:
# the raw sentence and its label converted to an integer.
train_documents = []
train_labels = []
for record in lines:
    train_documents.append(record[0])
    train_labels.append(int(record[1]))


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# This will basically create a term-frequency matrix from the training documents

In [13]:
# binary=True records only whether a term occurs in a document (1) rather than
# how often. It must be the boolean True: the string 'True' only worked through
# truthiness and is rejected by scikit-learn versions that validate parameters.
count_vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the raw sentences and replace them with the
# resulting sparse document-term matrix.
train_documents = count_vectorizer.fit_transform(train_documents)

In [14]:
train_documents # 3000 rows (one per document) x 5155 columns (one per vocabulary term)

<3000x5155 sparse matrix of type '<class 'numpy.int64'>'
	with 31578 stored elements in Compressed Sparse Row format>

In [16]:
# Printing a sparse row lists its stored entries as "(row, column)  value".
print(train_documents[0])

  (0, 4890)	1
  (0, 4133)	1
  (0, 2956)	1
  (0, 166)	1
  (0, 2954)	1
  (0, 75)	1
  (0, 1331)	1
  (0, 1401)	1
  (0, 5139)	1
  (0, 2764)	1


In [17]:
# Training

In [18]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes model on the vectorized documents and their labels.
# `fit` returns the fitted estimator, which is what `classifier` refers to.
nb_model = BernoulliNB()
classifier = nb_model.fit(train_documents, train_labels)

In [19]:
# Test phase

In [21]:
# Encode the new sentence with the already-fitted vectorizer (transform, not
# fit_transform) and classify it; labels in the training data are 0 or 1.
classifier.predict(count_vectorizer.transform(["This is a great movie"]))

array([1])

In [23]:
# Same check with a different sentence, again encoded by the fitted vectorizer.
classifier.predict(count_vectorizer.transform(["Worst movie"]))

array([0])

## SVM

* Every image is represented as a point in an N-dimensional hypercube (a tuple of length N).

In [24]:
# Every image in the dataset has already been turned into an n-dimensional tuple

In [25]:
import pandas as pd 
import numpy as np

In [26]:
# Path of the comma-separated ad dataset.
dataFile = "./ad-dataset/ad.data"
# header=None: do not treat the first row as column names, so the columns are
# labelled with integers. low_memory=False is passed through to pandas unchanged.
data = pd.read_csv(
    dataFile,
    header=None,
    sep=",",
    low_memory=False,
)

In [28]:
# Show the first 10 rows; the last column holds the class string (e.g. "ad.").
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
5,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
6,59,460,7.7966,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
7,60,234,3.9,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
8,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
9,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [32]:
# Check if a given value is a missing value and convert it to NaN
def toNum(cell):
    """Return ``cell`` converted to ``float``, or ``np.nan`` if it cannot be converted.

    Parameters
    ----------
    cell : object
        A single cell value (for example a number or a string).

    Returns
    -------
    float
        ``float(cell)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(cell)
    # Only conversion failures mean "missing value". A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and make the cell uninterruptible.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Column-wise wrapper: run the missing-value conversion on every element of a pandas Series
def seriestoNum(series):
    """Apply ``toNum`` to each element of ``series`` and return the resulting Series."""
    converted = series.apply(toNum)
    return converted

In [33]:
# Feature matrix: all rows, every column except the last one (the class string),
# with each column passed through seriestoNum so cells become floats or NaN.
feature_columns = data.iloc[:, :-1]
train_data = feature_columns.apply(seriestoNum)

In [34]:
# Inspect the first rows of the converted feature matrix (all values are now floats).
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557
0,125.0,125.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,57.0,468.0,8.2105,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33.0,230.0,6.9696,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60.0,468.0,7.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60.0,468.0,7.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Drop every row that contains at least one NaN (a cell that could not be
# converted to a number). Reassigning instead of using inplace=True avoids
# mutating the frame in place; the surviving rows keep their original index
# labels, which are used later to look up the matching class labels in `data`.
train_data = train_data.dropna()

In [36]:
def toLabel(str):
    """Encode a class string as an integer: 1 for "ad.", 0 for anything else."""
    # NOTE: the parameter name shadows the built-in `str`; it is kept unchanged
    # so existing callers are unaffected.
    if str == "ad.":
        return 1
    return 0

In [37]:
# Take the class column (the last column of `data`) for exactly the rows that
# survived dropna, and encode it as 1 ("ad.") / 0 (anything else).
# `train_data.index` holds index *labels*, so label-based `.loc` is used;
# positional `.iloc` only gave the same rows because `data` has a default
# 0..n-1 index.
train_labels = data.loc[train_data.index, data.columns[-1]].apply(toLabel)
train_labels

0       1
1       1
2       1
3       1
4       1
       ..
3273    0
3274    0
3275    0
3276    0
3278    0
Name: 1558, Length: 2359, dtype: int64

In [39]:
# Training Phase
from sklearn.svm import LinearSVC

# Fit on rows 100..2299 *by position* of the cleaned data. `.iloc` makes the
# positional slicing explicit for both the feature frame and the label Series,
# so features and labels stay aligned.
train_rows = slice(100, 2300)
# random_state fixes the estimator's internal random number generation so that
# re-running the notebook reproduces the same fitted model.
clf = LinearSVC(random_state=0).fit(train_data.iloc[train_rows], train_labels.iloc[train_rows])




In [43]:
# Test Phase
# Classify a single sample: the row at position 12 of the cleaned data, which
# lies outside the 100:2300 slice used for fitting. reshape(1, -1) turns the
# 1-D row into a single-row 2-D array before it is passed to predict.
clf.predict(train_data.iloc[12].values.reshape(1,-1))

array([1])