## Import modules

In [44]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import KNN as knn
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from num2words import num2words
from nltk.stem import WordNetLemmatizer 
import nltk


## Load data

In [47]:
data = pd.read_csv('train.txt', sep="\t", header=None).iloc[1:]
# test_data = pd.read_csv('test.txt', sep="\n")
test_data = np.loadtxt('test.txt',dtype='str')

data.columns = ["sentiment", "review"]
print("train data size: ",data.shape)
print("test data size: ",test_data.shape)

ValueError: could not convert string to float: '"This'

## Data at a glance

In [3]:
data["review"].iloc[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Data diversity

In [4]:
data.sentiment.value_counts()

-1    7609
1     7390
Name: sentiment, dtype: int64

## Remove Stop Words

In [5]:

stop = stopwords.words('english')
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [6]:
data["review"].iloc[0]

"One reviewers mentioned watching 1 Oz episode hooked. They right, exactly happened me.<br /><br />The first thing struck Oz brutality unflinching scenes violence, set right word GO. Trust me, show faint hearted timid. This show pulls punches regards drugs, sex violence. Its hardcore, classic use word.<br /><br />It called OZ nickname given Oswald Maximum Security State Penitentary. It focuses mainly Emerald City, experimental section prison cells glass fronts face inwards, privacy high agenda. Em City home many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish more....so scuffles, death stares, dodgy dealings shady agreements never far away.<br /><br />I would say main appeal show due fact goes shows dare. Forget pretty pictures painted mainstream audiences, forget charm, forget romance...OZ mess around. The first episode I ever saw struck nasty surreal, I say I ready it, I watched more, I developed taste Oz, got accustomed high levels graphic violence. Not violence, in

## Lemmatisation

In [7]:
lemmatizer = WordNetLemmatizer() 
data['review'] = data['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [8]:
data["review"].iloc[0]

"One reviewer mentioned watching 1 Oz episode hooked. They right, exactly happened me.<br /><br />The first thing struck Oz brutality unflinching scene violence, set right word GO. Trust me, show faint hearted timid. This show pull punch regard drugs, sex violence. Its hardcore, classic use word.<br /><br />It called OZ nickname given Oswald Maximum Security State Penitentary. It focus mainly Emerald City, experimental section prison cell glass front face inwards, privacy high agenda. Em City home many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish more....so scuffles, death stares, dodgy dealing shady agreement never far away.<br /><br />I would say main appeal show due fact go show dare. Forget pretty picture painted mainstream audiences, forget charm, forget romance...OZ mess around. The first episode I ever saw struck nasty surreal, I say I ready it, I watched more, I developed taste Oz, got accustomed high level graphic violence. Not violence, injustice (crooked 

## Stemming

In [9]:
stemmer = SnowballStemmer("english") # Stem every word.
data['review'] = data['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [10]:
data["review"].iloc[0]

"one review mention watch 1 oz episod hooked. they right, exact happen me.<br /><br />the first thing struck oz brutal unflinch scene violence, set right word go. trust me, show faint heart timid. this show pull punch regard drugs, sex violence. it hardcore, classic use word.<br /><br />it call oz nicknam given oswald maximum secur state penitentary. it focus main emerald city, experiment section prison cell glass front face inwards, privaci high agenda. em citi home many..aryans, muslims, gangstas, latinos, christians, italians, irish more....so scuffles, death stares, dodgi deal shadi agreement never far away.<br /><br />i would say main appeal show due fact go show dare. forget pretti pictur paint mainstream audiences, forget charm, forget romance...oz mess around. the first episod i ever saw struck nasti surreal, i say i readi it, i watch more, i develop tast oz, got accustom high level graphic violence. not violence, injustic (crook guard who'll sold nickel, inmat who'll kill orde

## Remove using Regex

In [11]:
data["review"] = data['review'].str.replace('[^\w\s]','')

In [12]:
data["review"].iloc[0]

'one review mention watch 1 oz episod hooked they right exact happen mebr br the first thing struck oz brutal unflinch scene violence set right word go trust me show faint heart timid this show pull punch regard drugs sex violence it hardcore classic use wordbr br it call oz nicknam given oswald maximum secur state penitentary it focus main emerald city experiment section prison cell glass front face inwards privaci high agenda em citi home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgi deal shadi agreement never far awaybr br i would say main appeal show due fact go show dare forget pretti pictur paint mainstream audiences forget charm forget romanceoz mess around the first episod i ever saw struck nasti surreal i say i readi it i watch more i develop tast oz got accustom high level graphic violence not violence injustic crook guard wholl sold nickel inmat wholl kill order get away it well mannered middl class inmat turn prison bitch d

## Remove numbers 

In [13]:
data["review"] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if not word.isnumeric()]))

In [14]:
data["review"].iloc[0]

'one review mention watch oz episod hooked they right exact happen mebr br the first thing struck oz brutal unflinch scene violence set right word go trust me show faint heart timid this show pull punch regard drugs sex violence it hardcore classic use wordbr br it call oz nicknam given oswald maximum secur state penitentary it focus main emerald city experiment section prison cell glass front face inwards privaci high agenda em citi home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgi deal shadi agreement never far awaybr br i would say main appeal show due fact go show dare forget pretti pictur paint mainstream audiences forget charm forget romanceoz mess around the first episod i ever saw struck nasti surreal i say i readi it i watch more i develop tast oz got accustom high level graphic violence not violence injustic crook guard wholl sold nickel inmat wholl kill order get away it well mannered middl class inmat turn prison bitch due

## Remove min words

In [15]:
data["review"] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if (len(word)>2)]))

In [16]:
data["review"].iloc[0]

'one review mention watch episod hooked they right exact happen mebr the first thing struck brutal unflinch scene violence set right word trust show faint heart timid this show pull punch regard drugs sex violence hardcore classic use wordbr call nicknam given oswald maximum secur state penitentary focus main emerald city experiment section prison cell glass front face inwards privaci high agenda citi home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgi deal shadi agreement never far awaybr would say main appeal show due fact show dare forget pretti pictur paint mainstream audiences forget charm forget romanceoz mess around the first episod ever saw struck nasti surreal say readi watch more develop tast got accustom high level graphic violence not violence injustic crook guard wholl sold nickel inmat wholl kill order get away well mannered middl class inmat turn prison bitch due lack street skill prison experience watch may becom comfort

## Lets do tf-idf

In [17]:
v = TfidfVectorizer(use_idf=True)
tfIdf = v.fit_transform(data['review'])
df = pd.DataFrame(tfIdf.T.toarray(), index=v.get_feature_names())
# df = df.sort_values('TF-IDF', ascending=False)



In [18]:
# print(v.get_feature_names())
# df = df.sort_values(v.get_feature_names(),ascending=False)


## Split the data-set 

In [19]:
c = df.T
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(x, np.asarray(y), test_size=0.23, random_state=42)


## Lets Use the KNN(Implemented from Scratch)

In [20]:
# model = knn.KNN(25,X_train,y_train.reshape(-1,1))

In [21]:
# print(X_train.shape)
# print(X_test.shape)

In [22]:
# Y_pred = model.forward(X_test)

In [23]:
# from sklearn.metrics import accuracy_score
# print(accuracy_score(y_test,Y_pred))

0.7834782608695652


## Train on whole set

In [40]:
model = knn.KNN(25,x,np.asarray(y).reshape(-1,1))
Y_pred = model.forward(test_data)


TypeError: __init__() got an unexpected keyword argument 'error_bad_lines'

## Save model output

In [30]:

with open('output.txt', mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(results[0] for results in Y_pred))

