In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import zipfile # to read zip files
from sklearn.model_selection import train_test_split


# data understanding libraries
import matplotlib.pyplot as plt # ploting library
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter


# data preparation
import re
from nltk.stem import PorterStemmer


# ADS Creation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB

# Evaluation and Model Selection
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision',150)
pd.options.display.float_format = '{:,.3f}'.format

# Movie Problem

* What sentiment does each phrase of a movie review express?
* Each row represents one phrase.
* Sentiment is labelled on a scale from 0 to 4.

In [None]:
# Location of the zipped training TSV.
train_zip_path = '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip'

# pandas reads the zipped TSV directly, so the archive itself is only opened
# briefly to show what it contains. The context manager closes the file handle
# again; previously the ZipFile was left open for the whole session without
# ever being read.
with zipfile.ZipFile(train_zip_path) as archive_train:
    print(archive_train.namelist())

train = pd.read_csv(train_zip_path, sep='\t')
train.head(15)

In [None]:
# 80% of the rows go to training; the remaining 20% is held out and then cut
# in half to give equally sized validation and test sets.
train_data, holdout_data = train_test_split(train, test_size=0.2, random_state=1)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=1)

# Give each split a fresh 0..n-1 index instead of the shuffled original one.
train_data, val_data, test_data = (
    split.reset_index(drop=True) for split in (train_data, val_data, test_data)
)

In [None]:
print("Train set size is ",len(train_data))
print("Val set size is ",len(val_data))
print("Test set size is ",len(test_data))

# Data Understanding

* ## Data structure

In [None]:
train_data.info()

The main takeaways are:
* There are 124,848 records.
* There are no null values in any column.
* Only the `Phrase` column is a string.

In [None]:
# Flatten every training phrase into one list of whitespace-separated tokens.
# Joining the column directly gives the same list as looping over iterrows(),
# without building a Series object for every row.
words = " ".join(train_data["Phrase"]).split()

In [None]:
words_set=set(words)
len(words_set)


## 2.3 What is the frequency of each Sentiment?

In [None]:
labels = train_data['Sentiment'].unique()

In [None]:
# Count phrases per sentiment once and order the result by label, so that each
# bar's x position and height come from the same row. Pairing `unique()`
# (order of first appearance) with `value_counts()` (ordered by frequency)
# can attach a count to the wrong sentiment.
sentiment_counts = train_data['Sentiment'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15,10)) # create the plot and specify the figure size
ax.bar(sentiment_counts.index, sentiment_counts.values) # one bar per sentiment label
ax.set_xlabel('Sentiment') # specify the x labels
ax.set_ylabel('Frequency') # specify the y labels
ax.set_title('Frequency of Sentiment') # specify the plot title
ax.set_xticks(list(sentiment_counts.index)) # one tick per sentiment label
ax.tick_params(axis='x', rotation=0) # keep the x labels horizontal
ax.grid() # show the grid
plt.show() # show the final plot


# How long is each phrase?

In [None]:
# Add a column holding the length of each phrase.
# NOTE(review): `len` applied to a string counts characters, not words, so
# `Phrase_num` is a character count; a word count would need something like
# `.str.split().str.len()` — confirm which measure is intended.
train_data['Phrase_num'] = train_data["Phrase"].apply(len)

# Distinct length values, in order of first appearance (`unique()` does not sort).
numbers = train_data["Phrase"].apply(len).unique()

In [None]:
# Number of phrases for every observed phrase length. Both the x positions and
# the bar heights are taken from the same sorted Series; using the unsorted
# `unique()` values for x would pair lengths with the wrong counts.
# `Phrase_num` is computed with `len` on the raw string, i.e. it is measured
# in characters, and the axis labels say so.
length_counts = train_data["Phrase_num"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(30,10))
ax.bar(length_counts.index, length_counts.values)
ax.set_xlabel('Phrase length (characters)')
ax.set_ylabel('Number of phrases')
ax.set_title('Number of phrases per phrase length')
# A tick every 4 units between the shortest and the longest phrase.
ax.set_xticks(np.arange(length_counts.index.min(), length_counts.index.max() + 1, 4))
ax.grid()
plt.show()

In [None]:
# Bin edges every 5 units from 0 to 175; lengths beyond the last edge are not drawn.
bins = range(0,180,5)

# `Phrase_num` holds `len` of the raw phrase string, so the x axis is a
# character count and is labelled accordingly.
fig, ax = plt.subplots(figsize=(20,10))
plt.hist(train_data['Phrase_num'], bins=bins, edgecolor="k") # output a histogram plot
plt.xlabel('Phrase length (characters)')
plt.ylabel('Number of Phrases')
plt.title('Number of phrases per phrase length')
plt.xticks(bins) # change x labels from the default to the given range
plt.grid()
plt.show()

> Now this gives clearer information. The idea is not only to analyse the data, but also to find the best way to describe it.

> The main takeaways are:

> Almost 80k of the roughly 100k phrases are between 2 and 60 characters long (~80% of the data). Note that `Phrase_num` is computed with `len`, so it counts characters rather than words.

> Phrases longer than 130 characters are outliers.

In [None]:
print("There are " ,len(train_data[train_data["Phrase_num"]>130]), " phrases with word more than 130.")

In [None]:
train_data[train_data["Phrase_num"]>130][['Phrase']]

In [None]:
# Sentiment distribution of the outlier phrases. The filter uses the same
# strict `> 130` cut-off as the cells that count and list these phrases
# (it used to be `>= 130`, which selected a slightly different set).
long_phrases = train_data[train_data["Phrase_num"] > 130]

fig, ax = plt.subplots(figsize=(10,8))
# groupby(...).size() counts phrases per sentiment, so the value axis is a number of phrases.
long_phrases.groupby(['Sentiment']).size().sort_values().plot(kind='barh', ax=ax)
plt.title('Sentiment distribution of phrases longer than 130')
plt.ylabel('Sentiment')
plt.xlabel('Number of phrases')
plt.grid()
plt.show()


# What is the frequency of each word?

In [None]:
# The 15 most common raw tokens across all training phrases.
lst = Counter(words).most_common(15)
df = pd.DataFrame(lst, columns = ['words', 'Count'])

fig, ax = plt.subplots(figsize=(10,8))
df.plot.bar(x='words', y='Count', ax=ax)
# The chart shows words from the phrases, so the title says "Words"
# (it previously read "Ingredient").
plt.title('15 Most Frequent Words')
plt.ylabel('Frequency')
plt.xlabel('word')
plt.show()

In [None]:
wordcloud = WordCloud(width = 1000, height = 500).generate(' '.join(words))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Most Used word")
plt.axis("off")
plt.show()

# What is the frequency of the words per sentiment?

In [None]:
train_data['SplitPhrase']=train_data['Phrase'].str.split()
train_data['SplitPhrase']

In [None]:

# Token frequencies for every sentiment class.
counters = {}
for Sentiment in train_data['Sentiment'].unique():
    counters[Sentiment] = Counter()
    indices = (train_data['Sentiment'] == Sentiment)
    for SplitPhrase in train_data[indices]['SplitPhrase']:
        counters[Sentiment].update(SplitPhrase)

# One panel per sentiment, drawn directly on the axes created by
# plt.subplots(). Calling fig.add_subplot() again would stack a second set of
# axes on top of these and leave the empty originals showing underneath.
# Axis sharing is omitted because every panel is an independent image with
# its axes switched off.
fig, axes = plt.subplots(1, len(counters), figsize=(20, 8), squeeze=False)
for ax, Sentiment in zip(axes.ravel(), sorted(counters)):
    wordcloud = WordCloud(background_color="white")
    wordcloud.generate_from_frequencies(frequencies=counters[Sentiment])
    ax.set_title(Sentiment)
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")

# 2. Data Preparation

## 2.1 Data Cleansing

In [None]:
train_data.head(30)

In [None]:
pd.Series([s for s in words if "-" in s]).unique()

In [None]:
pd.Series([s for s in words if "'" in s]).unique()

In [None]:
pd.Series([s for s in words if any(char.isdigit() for char in s)]).unique()

In [None]:
pd.Series([s for s in words if "," in s]).unique()

In [None]:
pd.Series([s for s in words if "." in s]).unique()

In [None]:
pd.Series([s for s in words if re.findall('[^a-zA-Z]',re.sub(r'[^\w\s]','',s))]).unique()

What needs to be cleaned?

* Mixed lower- and upper-case text.
* Hyphenated words.
* Dots, commas, and numbers.

In [None]:
train_data.head()

In [None]:

porter = PorterStemmer()
# lancaster=LancasterStemmer()

def ret_words(SplitPhrase):
    """Normalise one tokenised phrase into a space-separated string of stems.

    Steps: join the tokens, turn hyphens into spaces, drop dots and commas,
    lower-case, then for every remaining token strip the other punctuation
    and stem it with the module-level ``porter`` stemmer.

    Tokens are discarded when they contain a digit, when anything other than
    ASCII letters is left after punctuation stripping, or when nothing is
    left at all (punctuation-only tokens).

    Parameters
    ----------
    SplitPhrase : iterable of str, or None/NaN
        Tokens of one phrase, e.g. the result of ``str.split()``.

    Returns
    -------
    str
        The kept stems joined by single spaces; ``''`` when no token survives
        or when the phrase is missing.
    """
    # `Series.str.split()` leaves missing phrases as None/NaN; treat them as
    # empty text so that every row still receives a (blank) document.
    if SplitPhrase is None or isinstance(SplitPhrase, float):
        return ''

    word_text = ' '.join(SplitPhrase)
    word_text = word_text.replace('-', ' ').replace('.', '').replace(',', '')
    word_text = word_text.lower()

    final = []
    for ana in word_text.split():
        if re.search('[0-9]', ana):
            continue
        # Strip remaining punctuation once and reuse the result below.
        cleaned = re.sub(r'[^\w\s]', '', ana)
        # Skip tokens that were punctuation only: stemming '' would add an
        # empty entry and leave stray spaces in the output.
        if not cleaned:
            continue
        if re.search('[^a-zA-Z]', cleaned):
            continue
        final.append(porter.stem(cleaned))
    return ' '.join(final)

def preprocess(df,flag):
    """Add a ``words`` column with the cleaned, stemmed text of every phrase.

    Each ``Phrase`` is split on whitespace and passed through ``ret_words``.
    The column is written onto ``df`` itself and the same frame is returned.
    ``flag`` is accepted but not used.
    """
    tokenised = df['Phrase'].str.split()
    df['words'] = tokenised.map(ret_words)
    return df

In [None]:
train_preprocessed = preprocess(train_data,0)
val_preprocessed = preprocess(val_data,1)
test_preprocessed = preprocess(test_data,1)


In [None]:
train_preprocessed.head(100)

In [None]:
# Vocabulary size after cleaning and stemming. `split()` without an argument
# collapses repeated spaces, so an empty string is never counted as a word
# (with `split(' ')` it can be), and joining the column directly avoids the
# row-by-row iterrows() loop.
len(set(' '.join(train_preprocessed["words"]).split()))

## Separate the data

In [None]:
id_train, X_train, y_train = train_preprocessed['PhraseId'], train_preprocessed['words'], train_preprocessed['Sentiment']
id_test, X_test, y_test = test_preprocessed['PhraseId'], test_preprocessed['words'], test_preprocessed['Sentiment']

## ADS Creation

In [None]:
# BoW
BoW = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one pass.
Count_data = BoW.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = (BoW.get_feature_names_out() if hasattr(BoW, 'get_feature_names_out')
                 else BoW.get_feature_names())

# NOTE(review): toarray() materialises the full documents x vocabulary matrix
# in memory, which can be very large for the whole training set.
BoW_X_train = pd.DataFrame(Count_data.toarray(), columns=feature_names)

BoW_X_train

In [None]:
X_train.head()

In [None]:
# Same bag-of-words encoding, refitted on the first five phrases only so the
# resulting table is small enough to read.
Count_data = BoW.fit_transform(X_train.head())
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = (BoW.get_feature_names_out() if hasattr(BoW, 'get_feature_names_out')
                 else BoW.get_feature_names())
BoW_X_train = pd.DataFrame(Count_data.toarray(), columns=feature_names)
BoW_X_train

## TFIDF

In [None]:
# TFIDF
TFIDF = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2', encoding='latin-1',\
                ngram_range=(1, 2), stop_words='english')

# Learn vocabulary and IDF weights and build the weighted matrix in one pass.
Count_data = TFIDF.fit_transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = (TFIDF.get_feature_names_out() if hasattr(TFIDF, 'get_feature_names_out')
                 else TFIDF.get_feature_names())
# NOTE(review): toarray() materialises the full documents x n-gram matrix in
# memory, which can be very large for the whole training set.
TFIDF_X_train = pd.DataFrame(Count_data.toarray(), columns=feature_names)


TFIDF_X_train

In [None]:
X_train.head(5)

In [None]:
# Default TF-IDF settings, fitted on the first five phrases only so the
# resulting table is small enough to read.
TFIDF = TfidfVectorizer()
Count_data = TFIDF.fit_transform(X_train.head(5))
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = (TFIDF.get_feature_names_out() if hasattr(TFIDF, 'get_feature_names_out')
                 else TFIDF.get_feature_names())
TFIDF_X_train = pd.DataFrame(Count_data.toarray(), columns=feature_names)


TFIDF_X_train

# Modeling

In [None]:
id_train, X_train, y_train = train_preprocessed['PhraseId'], train_preprocessed['words'], train_preprocessed['Sentiment']
id_val, X_val, y_val = val_preprocessed['PhraseId'], val_preprocessed['words'], val_preprocessed['Sentiment']
id_test, X_test, y_test = test_preprocessed['PhraseId'], test_preprocessed['words'], test_preprocessed['Sentiment']

BoW

In [None]:
LR_clf_counts = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression(random_state=0, max_iter=2000))
])
LR_clf_counts.fit(X_train, y_train)
LR_cnt_pred_tr = LR_clf_counts.predict(X_train)

print(accuracy_score(y_train, LR_cnt_pred_tr))
print(precision_score(y_train, LR_cnt_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (count-vector + logistic regression pipeline)
train_sizes, train_scores, test_scores = learning_curve(LR_clf_counts, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
SVM_clf_counts = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LinearSVC(max_iter=3000))
])
SVM_clf_counts.fit(X_train, y_train)
SVM_cnt_pred_tr = SVM_clf_counts.predict(X_train)

print(accuracy_score(y_train, SVM_cnt_pred_tr))
print(precision_score(y_train, SVM_cnt_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (count-vector + linear SVM pipeline)
train_sizes, train_scores, test_scores = learning_curve(SVM_clf_counts, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
NB_clf_counts = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])
NB_clf_counts.fit(X_train, y_train)
NB_cnt_pred_tr = NB_clf_counts.predict(X_train)

print(accuracy_score(y_train, NB_cnt_pred_tr))
print(precision_score(y_train, NB_cnt_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (count-vector + multinomial naive Bayes pipeline)
train_sizes, train_scores, test_scores = learning_curve(NB_clf_counts, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

### TFIDF

In [None]:
LR_clf_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2', encoding='latin-1',ngram_range=(1, 2), stop_words='english')),
    ('clf', LogisticRegression(random_state=0, max_iter=2000))
])
LR_clf_tfidf.fit(X_train, y_train)
LR_tfidf_pred_tr = LR_clf_tfidf.predict(X_train)

print(accuracy_score(y_train, LR_tfidf_pred_tr))
print(precision_score(y_train, LR_tfidf_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (TF-IDF + logistic regression pipeline)
train_sizes, train_scores, test_scores = learning_curve(LR_clf_tfidf, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
SVM_clf_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')),
    ('clf', LinearSVC( max_iter=2000))
])
SVM_clf_tfidf.fit(X_train, y_train)
SVM_tfidf_pred_tr = SVM_clf_tfidf.predict(X_train)

print(accuracy_score(y_train, SVM_tfidf_pred_tr))
print(precision_score(y_train, SVM_tfidf_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (TF-IDF + linear SVM pipeline)
train_sizes, train_scores, test_scores = learning_curve(SVM_clf_tfidf, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
NB_clf_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')),
    ('clf', MultinomialNB())
])
NB_clf_tfidf.fit(X_train, y_train)
NB_tfidf_pred_tr = NB_clf_tfidf.predict(X_train)

print(accuracy_score(y_train, NB_tfidf_pred_tr))
print(precision_score(y_train, NB_tfidf_pred_tr, average='weighted'))

In [None]:
# Create CV training and test scores for various training set sizes
# (TF-IDF + multinomial naive Bayes pipeline)
train_sizes, train_scores, test_scores = learning_curve(NB_clf_tfidf, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

# Hyperparameter tuning

In [None]:
vect=  CountVectorizer()
X_train_cnt = vect.fit_transform(X_train)

# Logistic Regression

In [None]:
def LR_param_selection(X, y, nfolds):
    """Grid-search the regularisation strength ``C`` of a logistic regression.

    Tries ``C`` in ``[0.01, 0.1, 1, 10]`` with ``nfolds``-fold cross-validation.
    No ``scoring`` is passed to ``GridSearchCV``, so candidates are ranked by
    the estimator's own default score.

    Parameters
    ----------
    X : feature matrix accepted by ``LogisticRegression.fit``
    y : target labels aligned with ``X``
    nfolds : value forwarded to ``GridSearchCV`` as ``cv``

    Returns
    -------
    dict
        The best parameter setting found, e.g. ``{'C': 1}``.
    """
    param_grid = {'C': [0.01, 0.1, 1, 10]}
    grid_search = GridSearchCV(
        LogisticRegression(random_state=0, max_iter=2000),
        param_grid,
        cv=nfolds,
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
LR_param_selection( X_train_cnt,y_train,2)

## final model

In [None]:
LR_clf_counts = Pipeline([('vect', CountVectorizer()),
                   ('clf', LogisticRegression(C=1,random_state=0, max_iter=2000)),
                  ])
LR_clf_counts.fit(X_train, y_train)
LR_cnt_pred_tr = LR_clf_counts.predict(X_train)
LR_cnt_pred_val = LR_clf_counts.predict(X_val)
LR_cnt_pred_tst = LR_clf_counts.predict(X_test)


print("precision on training: ",precision_score(y_train, LR_cnt_pred_tr, average='micro'))
print("precision on validation: ",precision_score(y_val, LR_cnt_pred_val, average='micro'))
print("precision on testing: ",precision_score(y_test, LR_cnt_pred_tst, average='micro'))

In [None]:
# Create CV training and test scores for various training set sizes
# (final count-vector + logistic regression pipeline)
train_sizes, train_scores, test_scores = learning_curve(LR_clf_counts, 
                                                        X_train, 
                                                        y_train,
                                                        # Number of folds in cross-validation
                                                        cv=3,
                                                        # Evaluation metric
                                                        scoring='precision_weighted',
                                                        # Use all computer cores
                                                        n_jobs=-1, 
                                                        # 10 evenly spaced training-set fractions from 0.01 to 1.0
                                                        train_sizes=np.linspace(0.01, 1.0, 10))

# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, ax = plt.subplots(figsize=(15,10))
# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Weighted Precision Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
# The data itself is read straight from the zip files by pandas, so the
# archives are only opened briefly to show their contents. The context
# manager closes both file handles again instead of leaving them open for
# the rest of the session.
with zipfile.ZipFile('../input/sentiment-analysis-on-movie-reviews/train.tsv.zip') as archive_train, \
     zipfile.ZipFile('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip') as archive_test:
    print(archive_train.namelist(), archive_test.namelist())

In [None]:
final_train = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/train.tsv.zip", sep='\t')
final_test = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/test.tsv.zip", sep='\t')

In [None]:
ftrain_preprocessed = preprocess (final_train,0)
ftest_preprocessed = preprocess (final_test,1)

In [None]:
id_train, X_train, y_train = ftrain_preprocessed['PhraseId'], ftrain_preprocessed['words'], ftrain_preprocessed['Sentiment']
id_test, X_test= ftest_preprocessed['PhraseId'], ftest_preprocessed['words']

In [None]:
# Final submission model: the selected configuration refitted on the complete
# labelled training file.
LR_clf = Pipeline([('vect', CountVectorizer()),
                   ('clf', LogisticRegression(C=1,random_state=0, max_iter=2000)),
                  ])
LR_clf.fit(X_train , y_train)
# Predict with the pipeline that was just fitted. Using `LR_clf_counts` here
# would produce predictions from the earlier model trained on the 80% split
# and ignore the refit on the full data.
pred_tst = LR_clf.predict(X_test)

In [None]:
output=pd.DataFrame({'PhraseId' : id_test , 'Sentiment' : pred_tst })
output.to_csv('Sentiment_preds_LR.csv' , index=False)