## Combining all the tokens into one and ordered by the sentence ID

In [7]:
#basic libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import math
import pandas as pd
import re

In [8]:
#text processing libraries
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer


#stopwords
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from string import punctuation
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [9]:
data = pd.read_csv('./train.csv')
data.fillna('', inplace=True)

data['tokenized_text'] = " "
data['tokenized_key'] = " "

In [10]:
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)
gensimwords = STOPWORDS
sklearnwords = ENGLISH_STOP_WORDS
num_pattern = r'[0-9]'

In [11]:

porter = PorterStemmer()
def tokenized_stop(string):
    string = re.sub(num_pattern, '', string)
    string = re.sub(r'http\S+', '', string)
    
    #tokenizing the words
    string = word_tokenize(string)
    
    #ignoring the unnecessary words
    string_list = []
    for words in string:
        words = words.casefold()
        if (words in stop_words) or (words in punctuation) or (words in gensimwords) or (words in sklearnwords):
            pass
        else:
            words = porter.stem(words)
            string_list.append(words)
             
    return string_list



In [12]:
for i in range(len(data['text'])):
    data.at[i,'tokenized_text'] = tokenized_stop(data['text'][i])
    data.at[i,'tokenized_key'] = tokenized_stop(data['keyword'][i])

In [13]:

porter = PorterStemmer()
def tokenized_stop(string):
    string = re.sub(num_pattern, '', string)
    string = re.sub(r'http\S+', '', string)
    
    #tokenizing the words
    string = word_tokenize(string)
    
    #ignoring the unnecessary words
    string_list = []
    for words in string:
        words = words.casefold()
        if (words in stop_words) or (words in punctuation) or (words in gensimwords) or (words in sklearnwords):
            pass
        else:
            words = porter.stem(words)
            string_list.append(words)
             
    return string_list



In [14]:
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)
gensimwords = STOPWORDS
sklearnwords = ENGLISH_STOP_WORDS
num_pattern = r'[0-9]'

In [15]:
all_sents=[]
for i in range(len(data['tokenized_text'])):
    string=''
    for j in data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
#vectorizer.fit(all_words)
X_train_counts = vectorizer.fit_transform(all_sents)

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [18]:
x_train = all_sents
y_train = data['target']


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),\
                     ('tfidf', TfidfTransformer()),\
                     ('clf', LogisticRegression(penalty = 'l2',C=0.01, max_iter=1000, solver = 'lbfgs',random_state=20 ))])

In [20]:
parameters = {'tfidf__use_idf': (True, False),\
               'clf__penalty': ('l1','l2','elasticnet'),\
                'clf__C': (1,0.1,0.01,0.001),\
             'clf__max_iter': (10,100,1000),\
             'clf__solver': ('lbfgs','lblinear','sag','newton-cg') }

In [21]:
grid_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [22]:
grid_clf = grid_clf.fit(x_train, y_train)

        nan        nan 0.69408164 0.69014256        nan        nan
 0.69854901 0.68922341 0.69907446 0.68961711        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.69894314 0.68974843        nan        nan
 0.69907446 0.68974843 0.69907446 0.68974843        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.69894314 0.68974843        nan        nan
 0.69907446 0.68974843 0.69907446 0.68974843        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.66058414 0.67306452        nan        nan
 0.66045282 0.67319584 0.66071546 0.67306452        nan        nan
        nan        nan        nan        nan        nan       

In [23]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)

0.699074463137714
{'clf__C': 1, 'clf__max_iter': 10, 'clf__penalty': 'l2', 'clf__solver': 'newton-cg', 'tfidf__use_idf': True}


In [24]:
test_data = pd.read_csv('./test.csv')
test_data.fillna('', inplace=True)

test_data['tokenized_text'] = " "
test_data['tokenized_key'] = " "



In [25]:
for i in range(len(test_data['text'])):
    test_data.at[i,'tokenized_text'] = tokenized_stop(test_data['text'][i])
    test_data.at[i,'tokenized_key'] = tokenized_stop(test_data['keyword'][i])

In [26]:
all_sents=[]
for i in range(len(test_data['tokenized_text'])):
    string=''
    for j in test_data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)

In [27]:
x_test = all_sents
y_test = test_data['target']


In [28]:
predicted = grid_clf.predict(x_test)