In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
data= pd.read_table('/content/train.tsv')

In [3]:
data

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [4]:
np.unique(data['Sentiment'])

array([0, 1, 2, 3, 4])

In [5]:
data['Phrase']

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [6]:
data['Phrase'] = data['Phrase'].apply(lambda x : str(x.lower()))

In [7]:
from string import punctuation
def punctuation_removal(val):
  return "".join(c for c in val if c not in punctuation)

In [8]:
data['Phrase'] = data['Phrase'].apply(lambda x : punctuation_removal(x))

In [9]:
data['Phrase']

0         a series of escapades demonstrating the adage ...
1         a series of escapades demonstrating the adage ...
2                                                  a series
3                                                         a
4                                                    series
                                ...                        
156055                                             hearst s
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [10]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from nltk.stem.porter import *
porter= PorterStemmer()

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
def stop_word(val):
  tokens = word_tokenize(val)
  filtered_sentence = [c for c in tokens if c not in stop_words]
  return " ".join(c for c in filtered_sentence)

In [14]:
def stemming(val):
  tokens = word_tokenize(val)
  filtered_sentence = [porter.stem(c) for c in tokens]
  return " ".join(c for c in filtered_sentence)

In [15]:
data['Phrase'] = data['Phrase'].apply(lambda x : stop_word(x))

In [16]:
data['Phrase'] = data['Phrase'].apply(lambda x : stemming(x))

In [17]:
data['Phrase']

0         seri escapad demonstr adag good goos also good...
1                      seri escapad demonstr adag good goos
2                                                      seri
3                                                          
4                                                      seri
                                ...                        
156055                                               hearst
156056                                forc avuncular chortl
156057                                     avuncular chortl
156058                                            avuncular
156059                                               chortl
Name: Phrase, Length: 156060, dtype: object

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=2, ngram_range=(1, 1))
X_train = vect.fit(data['Phrase']).transform(data['Phrase']) 


In [19]:
X_train.shape

(156060, 11836)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [21]:
lr = LogisticRegression(max_iter=300)
lr.fit(X_train,data['Sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr.score(X_train,data['Sentiment'])

0.6948096885813149

In [23]:
test_data= pd.read_table('/content/test.tsv')

In [24]:
train_predict = lr.predict(X_train)

In [25]:
train_predict

array([2, 2, 2, ..., 2, 2, 2])

In [26]:
from sklearn.metrics import classification_report,confusion_matrix

In [27]:
confusion_matrix(data['Sentiment'],train_predict)

array([[ 2712,  2872,  1337,   135,    16],
       [  828, 13153, 12269,   967,    56],
       [  212,  3658, 71245,  4297,   170],
       [   32,   741, 13381, 17523,  1250],
       [    7,    92,  1134,  4174,  3799]])

In [28]:
classification_report(data['Sentiment'],train_predict)

'              precision    recall  f1-score   support\n\n           0       0.72      0.38      0.50      7072\n           1       0.64      0.48      0.55     27273\n           2       0.72      0.90      0.80     79582\n           3       0.65      0.53      0.58     32927\n           4       0.72      0.41      0.52      9206\n\n    accuracy                           0.69    156060\n   macro avg       0.69      0.54      0.59    156060\nweighted avg       0.69      0.69      0.68    156060\n'

In [29]:
test_data

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
...,...,...,...
66287,222348,11855,"A long-winded , predictable scenario ."
66288,222349,11855,"A long-winded , predictable scenario"
66289,222350,11855,"A long-winded ,"
66290,222351,11855,A long-winded


In [30]:
test_data['Phrase'] = test_data['Phrase'].apply(lambda x : str(x.lower()))
test_data['Phrase'] = test_data['Phrase'].apply(lambda x : punctuation_removal(x))
test_data['Phrase'] = test_data['Phrase'].apply(lambda x : stop_word(x))
test_data['Phrase'] = test_data['Phrase'].apply(lambda x : stemming(x))

In [31]:
test_x = vect.transform(test_data['Phrase']) 

In [32]:
predict = lr.predict(test_x)

In [33]:
predict

array([3, 3, 2, ..., 1, 1, 1])

In [34]:
ids = [i for i in range(156061,222353)]

In [35]:
vals = list(zip(ids,predict))

In [36]:
data_frame = pd.DataFrame(vals,columns=['PhraseId','Sentiment'])

In [37]:
X_train

<156060x11836 sparse matrix of type '<class 'numpy.int64'>'
	with 609679 stored elements in Compressed Sparse Row format>

In [38]:
data_frame

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,2
...,...,...
66287,222348,1
66288,222349,1
66289,222350,1
66290,222351,1


In [39]:
files = data_frame.to_csv('submission.csv',index=False)