# Importing Libraries

In [None]:
import numpy as np
import pandas as pd

# Importing Dataset

In [None]:
# Load the labelled training reviews from the tab-separated (zipped) file.
train_path = '../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip'
dataset = pd.read_csv(train_path, sep="\t")

In [None]:
# Preview the first five rows to check how the file was parsed.
dataset.head(n=5)

# Data Cleaning

In [None]:
import re
from bs4 import BeautifulSoup
import nltk
# Download the NLTK stopword list so that stopwords.words(...) can be used by
# the cleaning cells below.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Turn every training review into a space-separated string of lower-cased,
# stemmed, stopword-free tokens and collect the results in `corpus`.
#
# Everything that does not depend on the individual review is built once up
# front. Previously the stemmer and stopword list were re-created per review
# and the stopword set was rebuilt for every single word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

# NOTE(review): BeautifulSoup is called without an explicit parser, so bs4
# picks whichever parser is installed; pass one explicitly if the extracted
# text must be reproducible across environments.
corpus = []
for raw_review in dataset['review']:
    # Strip HTML markup, keeping only the visible text.
    letters_only = BeautifulSoup(raw_review).get_text()
    # Replace every non-letter (digits, punctuation, ...) with a space, then
    # lower-case and tokenise on whitespace.
    words = non_letters.sub(' ', letters_only).lower().split()
    # Drop stopwords and stem what is left.
    reviews = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(reviews)
    

# Building bag-of-words features and splitting data into train set and test set

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-words features: the vocabulary is capped at 1250 terms.
cv = CountVectorizer(max_features=1250)
bag_of_words = cv.fit_transform(corpus)
x = bag_of_words.toarray()

# Labels are read from the second column of the training frame.
y = dataset.iloc[:, 1].values

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Training the Logistic Regression model on the training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyper-parameters on the
# training split.
classifier = LogisticRegression()
classifier.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out split and show them next to the true
# labels (column 0 = actual, column 1 = predicted).
y_pred = classifier.predict(x_test)
actual_column = y_test.reshape(-1, 1)
predicted_column = y_pred.reshape(-1, 1)
np.concatenate((actual_column, predicted_column), axis=1)

# Evaluating the predictions on the test set

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

# Print accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

# ROC Curve

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a decision threshold over continuous scores. Feeding it
# the hard labels from predict() leaves only one non-trivial threshold, so
# the "curve" degenerates to a single point joined to the corners. Use the
# predicted probability of the second class (classifier.classes_[1]) instead.
# NOTE(review): assumes a binary problem whose positive label is the larger
# of the two classes (e.g. 0/1) - confirm against the dataset.
y_score = classifier.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)

# Plot the ROC curve: true positive rate against false positive rate.
plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Submit the prediction

In [None]:
# Load the unlabelled competition reviews from the tab-separated (zipped) file.
test_path = '../input/word2vec-nlp-tutorial/testData.tsv.zip'
testset = pd.read_csv(test_path, sep="\t")

In [None]:
# Apply the same cleaning used for the training reviews to the competition
# reviews: strip HTML, keep letters only, lower-case, drop stopwords, stem.
# `corpus` is rebuilt from scratch and now holds the test reviews.
#
# Everything that does not depend on the individual review is built once up
# front. Previously the stemmer and stopword list were re-created per review
# and the stopword set was rebuilt for every single word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

# NOTE(review): BeautifulSoup is called without an explicit parser, so bs4
# picks whichever parser is installed; pass one explicitly if the extracted
# text must be reproducible across environments.
corpus = []
for raw_review in testset['review']:
    # Strip HTML markup, keeping only the visible text.
    letters_only = BeautifulSoup(raw_review).get_text()
    # Replace every non-letter (digits, punctuation, ...) with a space, then
    # lower-case and tokenise on whitespace.
    words = non_letters.sub(' ', letters_only).lower().split()
    # Drop stopwords and stem what is left.
    reviews = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(reviews)
    

In [None]:
# Encode the cleaned test reviews with the vectorizer that was already
# fitted (transform only, no refit), then predict their labels.
test_counts = cv.transform(corpus)
new_x_test = test_counts.toarray()
new_y_pred = classifier.predict(new_x_test)
new_y_pred

In [None]:
# Pair each test review id with its predicted label.
submission_columns = {
    'id': testset['id'],
    'sentiment': new_y_pred,
}
output = pd.DataFrame(submission_columns)

In [None]:
# Write the predictions to disk without the DataFrame index column.
output.to_csv(path_or_buf='submission.csv', index=False)