## Import libraries

In [15]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

## Import the TSV files (tab-separated values)

In [2]:
# Load the labelled training phrases (train.tsv) and the test phrases (test.tsv).
# Both files are tab-separated, hence the explicit separator.
data_df, input_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ('train.tsv', 'test.tsv')
)

In [3]:
# Summary statistics for the numeric columns of the training frame.
data_df.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [4]:
# Summary statistics for the numeric columns of the test frame (test.tsv).
input_df.describe()

Unnamed: 0,PhraseId,SentenceId
count,66292.0,66292.0
mean,189206.5,10114.909144
std,19136.99636,966.787807
min,156061.0,8545.0
25%,172633.75,9266.0
50%,189206.5,10086.0
75%,205779.25,10941.0
max,222352.0,11855.0


In [5]:
# Preview the first rows of the training frame.
data_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# Preview the first rows of the test frame (test.tsv).
input_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
# Model input is the raw phrase text; the target is the integer sentiment label.
X_train, y_train = data_df['Phrase'], data_df['Sentiment']

## Use a TF-IDF vectorizer with unigram and bigram features, removing the NLTK English stopwords and keeping at most 25 features

In [16]:
# Upper bound on the vocabulary size kept by the TF-IDF vectorizer.
feature_count = 25

# TF-IDF over unigrams and bigrams, with NLTK's English stopword list removed.
# `max_features` is driven by `feature_count` rather than a repeated literal, so
# editing the constant above actually changes the vectorizer.
# Note: nltk.corpus.stopwords requires the 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')).
vect = TfidfVectorizer(
    max_features=feature_count,
    stop_words=nltk.corpus.stopwords.words('english'),
    ngram_range=(1, 2),
)

## Use a simple Logistic Regression model

In [17]:
# Logistic regression classifier with solver progress output enabled.
model = LogisticRegression(verbose=True)

# Chain the vectorizer and the classifier, then fit both on the training phrases.
# `fit` returns the pipeline itself, so `pipe` is the fitted pipeline.
pipe = make_pipeline(vect, model).fit(X_train, y_train)

# Show the fitted pipeline's repr as the cell output.
pipe

[LibLinear]

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=25, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf...'l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False))])

In [18]:
# Accuracy measured on the same data the pipeline was fitted on (training
# accuracy), not on a held-out set.
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
print(accuracy_score(y_train, pipe.predict(X_train)))

0.5185313341022684


In [19]:
# Predict a sentiment label for every test phrase and attach it to the test frame.
test_predictions = pipe.predict(input_df['Phrase'])
input_df['Sentiment'] = test_predictions

# Export only the phrase identifier and its predicted label, without the index.
submission_columns = ['PhraseId', 'Sentiment']
input_df[submission_columns].to_csv('result.csv', index=False)