In [22]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [2]:
# Reading data
train_data = pd.read_csv('../data/training_data.tsv',header=0,delimiter="\t" ,quoting=3)

In [3]:
# Function to define if there is reminder 
def found(x):
    if (x[0] == "Not Found"):
        return "Not Found"
    else:
        return "Found"

In [4]:
#Applying the found function
train_data['label_found'] = train_data[['label']].apply( found , axis = 1)

In [5]:
#This Function help us to remove the Digits and Emojis from the data
def clean_txt(texts):
    letters = re.sub("[^a-zA-Z]",' ', str(texts))
    lower_case = letters.lower()
    words = lower_case.split()
    stopword = stopwords.words('english')
    meaning_words = [w for w in words if not w in stopword]
    return (" ".join(meaning_words))

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\JOY
[nltk_data]     SINHA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
#remove the Digits and Emojis from the data
train_data['sent_clean'] = [clean_txt(review) for review in train_data["sent"].values]

In [9]:
#Loading test data as test_data
test_data = pd.read_csv('../data/eval_data.txt',header=0,delimiter="\t" ,quoting=3)

In [10]:
#remove the Digits and Emojis from the data
test_data['sent_clean'] = [clean_txt(review) for review in test_data["sent"].values]

In [11]:
#Show the Data
test_data.sample(10)

Unnamed: 0,sent,sent_clean
93,Can you set me alarms,set alarms
815,Please schedule it for everyday,please schedule everyday
81,Can you please remind me for a vaccination at ...,please remind vaccination pm
939,I need 2 calls one fr 7.00 am n another for 7....,need calls one fr n another
763,"Surely,Thanks. The same will be for the Langua...",surely thanks language reminder
428,Can you remind me to send quote for incastt ma...,remind send quote incastt machines
665,Please close the reminder,please close reminder
327,Remind me at 2:42 pm today,remind pm today
389,i would like this reminder every day,would like reminder every day
494,Remind me to take a cake home at 7pm today,remind take cake home pm today


In [12]:
#Convert a collection of text documents to a matrix of token counts
#Bow train
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, max_features = 12000) 
bow_train = (vectorizer.fit_transform(train_data['sent_clean'])).toarray()
bow_test = (vectorizer.transform(test_data['sent_clean'])).toarray()

In [21]:
#Showing Data
train_data.sample(30)

Unnamed: 0,sent,label,label_found,sent_clean
2659,Remind me today at 7:00 also ok?,Not Found,Not Found,remind today also ok
6919,You need to remind me from 6 am to 10 pm,Not Found,Not Found,need remind pm
9026,Reminder for shaving ?,shaving,Found,reminder shaving
7867,About meeting boss at 9,meeting boss,Found,meeting boss
6161,Set me a reminder,Not Found,Not Found,set reminder
7929,Thank you for your remainder,Not Found,Not Found,thank remainder
1901,Can i have it scheduled on every weekday?,Not Found,Not Found,scheduled every weekday
6994,Can you set reminder at 10:00 am for Wipro work,Wipro work,Found,set reminder wipro work
393,Thanx for reminder swapnil,Not Found,Not Found,thanx reminder swapnil
2889,Can you tell me a software for windows laptop ...,Not Found,Not Found,tell software windows laptop following feature...


In [23]:
#Splitting the data in test and train with ratio of 10% and 90%
t_train , t_test , s_train , s_test = train_test_split(bow_train ,train_data['label_found'] , test_size = 0.10 , random_state=101)


# Logistic regression Machine Learning


Logistic regression is a supervised learning classification algorithm used to predict the probability of a target variable. The nature of target or dependent variable is dichotomous, which means there would be only two possible classes. ... Mathematically, a logistic regression model predicts P(Y=1) as a function of X.


In [24]:
#Applying the Logistic regrssion
logreg = LogisticRegression()
logreg = logreg.fit(t_train, s_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [25]:
#Print Accuracy
print(accuracy_score(logreg.predict(t_test),s_test))

0.8116089613034623


# Random Forest Classifier

Random forests or random decision forests are an ensemble learning method for classification, regression and other tasks that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes or mean prediction of the individual trees.





In [26]:
#Applying Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(t_train , s_train)
print(accuracy_score(rfc.predict(t_test),s_test))

0.7352342158859471


# Decision Tree Classifier


A decision tree classifier is a tree in which internal nodes are labeled by features. ... The classifier categorizes an object xi by recursively testing for the weights that the features labeling the internal nodes have in vector xi, until a leaf node is reached. The label of this node is then assigned to xi.



In [27]:
#Applying Decision Tree Classifier
dtc = DecisionTreeClassifier()
dtc.fit(t_train , s_train)
print(accuracy_score(s_test, dtc.predict(t_test)))

0.7077393075356415


In [28]:
#Applying Logistic Regression
logreg = LogisticRegression()
logreg = logreg.fit(bow_train ,train_data['label_found'])
pred = logreg.predict(bow_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [29]:
#Saving Model
filename = 'fmodel.sav'
joblib.dump(logreg, filename)

loaded_model = joblib.load(filename)

In [30]:
#Output Data in TSV format
output = pd.DataFrame( data={ "label_found":pred} )
output.to_csv( "result.tsv", index=False, quoting=3 )
output.sample(10)

Unnamed: 0,label_found
845,Found
779,Found
41,Found
769,Found
418,Found
597,Found
833,Not Found
361,Found
557,Found
894,Not Found
