In [88]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [89]:
# Load the labelled training set: tab-separated, first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
train_data = pd.read_csv(
    '../data/training_data.tsv',
    sep="\t",
    header=0,
    quoting=3,
)

In [90]:
# Collapse the raw label into a binary target: was a reminder found or not?
def found(x):
    """Map a raw label to the binary class ``"Found"`` / ``"Not Found"``.

    Parameters
    ----------
    x : sequence or pandas.Series
        Container whose first element is the raw label. When called through
        ``DataFrame.apply(..., axis=1)`` this is a one-element Series indexed
        by column name.

    Returns
    -------
    str
        ``"Not Found"`` if the first element equals ``"Not Found"``,
        otherwise ``"Found"``.
    """
    # A row Series is indexed by column label, so ``x[0]`` would rely on the
    # positional fallback for integer keys (deprecated in pandas 2.1).
    # ``iloc`` is explicitly positional; plain lists/tuples still use ``[0]``.
    first = x.iloc[0] if hasattr(x, "iloc") else x[0]
    return "Not Found" if first == "Not Found" else "Found"

In [91]:
# Derive the binary target column by applying `found` to every raw label.
# Each label is wrapped in a one-element list so `found` indexes a plain
# sequence. A row-wise DataFrame.apply would instead hand it a Series indexed
# by column name, where `x[0]` relies on the positional fallback that pandas
# deprecated in 2.1.
train_data['label_found'] = [found([label]) for label in train_data['label']]

In [92]:
# Text normalisation: keep ASCII letters only (this drops digits, punctuation
# and emojis), lower-case the text and remove English stopwords.
_STOPWORD_CACHE = {}


def clean_txt(texts):
    """Return a cleaned, space-joined version of ``texts``.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and English stopwords are
    dropped.

    Parameters
    ----------
    texts : object
        The raw sentence. Non-string values are converted with ``str()``.
        ``None`` and float NaN (a missing cell) yield an empty string rather
        than the literal token ``"nan"`` / ``"none"``.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    # Missing values would otherwise be stringified and kept as a real token.
    if texts is None or (isinstance(texts, float) and texts != texts):
        return ""

    # Load the stopword list once and keep it as a frozenset: the corpus read
    # is loop-invariant and set membership is O(1) instead of a list scan.
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    stopword = _STOPWORD_CACHE["english"]

    letters_only = re.sub("[^a-zA-Z]", ' ', str(texts))
    return " ".join(w for w in letters_only.lower().split() if w not in stopword)

In [93]:
# Build the cleaned training text (letters only, lower-cased, stopwords removed)
# by running clean_txt over every sentence.
train_data['sent_clean'] = train_data["sent"].map(clean_txt)

In [94]:
# Load the evaluation sentences with the same parsing options as the training
# file: tab-separated, header in the first row, quoting=3 (csv.QUOTE_NONE).
test_data = pd.read_csv(
    '../data/eval_data.txt',
    sep="\t",
    header=0,
    quoting=3,
)

In [95]:
# Apply the same clean_txt normalisation to the evaluation sentences so they
# are tokenised consistently with the training text.
test_data['sent_clean'] = test_data["sent"].map(clean_txt)

In [96]:
# Preview ten evaluation rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All".
test_data.sample(10, random_state=101)

Unnamed: 0,sent,sent_clean
401,I can remember now by my own,remember
893,This was such a small alarm,small alarm
391,"Call sanjay, rajesh today by 4 pm",call sanjay rajesh today pm
833,16/1/2016 please remind about court in the mor...,please remind court morning
240,Reminder on Monday at 11 am to go to guru Ji,reminder monday go guru ji
283,Remind me to tell him abt job preferences at P...,remind tell abt job preferences pune get advice
469,Set reminder for insurance payment on 15th Feb...,set reminder insurance payment th february
87,Setup reminder,setup reminder
713,Add some more reminders,add reminders
857,Can u Mee what was today's reminders of me,u mee today reminders


In [97]:
# Bag-of-words features: learn the vocabulary (at most 12000 terms) from the
# cleaned training text, then encode the evaluation text with that same
# vocabulary so both matrices share their columns.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    max_features=12000,
)
# .toarray() converts the sparse count matrices into dense numpy arrays.
bow_train = vectorizer.fit_transform(train_data['sent_clean']).toarray()
bow_test = vectorizer.transform(test_data['sent_clean']).toarray()

In [98]:
# Preview ten training rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All".
train_data.sample(10, random_state=101)

Unnamed: 0,sent,label,label_found,sent_clean
4105,Will you remind me about stock market downs or...,stock market downs or up,Found,remind stock market downs told
3940,Reminder to recharge,recharge,Found,reminder recharge
5759,Remind me to watch,watch,Found,remind watch
3895,TNx for reminder me,Not Found,Not Found,tnx reminder
6496,Yes plz remind me everyday,Not Found,Not Found,yes plz remind everyday
5393,Set reminder on 13/04/2017 to 27/04/2017 meeti...,meeting nishi school instructions,Found,set reminder meeting nishi school instructions
2475,End my all reminder,Not Found,Not Found,end reminder
3161,Thank u fr d reminder,Not Found,Not Found,thank u fr reminder
3012,"Meeting Reminder Date: November 23, 2016 Time:...",Not Found,Not Found,meeting reminder date november time meeting c b
5893,Remind me tomorrow Morning to take tax pm,take tax pm,Found,remind tomorrow morning take tax pm


In [99]:
# Hold out 10% of the labelled rows for validation; the other 90% is used for fitting.
# t_* are the bag-of-words feature matrices, s_* the matching 'label_found' targets.
t_train, t_test, s_train, s_test = train_test_split(
    bow_train,
    train_data['label_found'],
    test_size=0.10,
    random_state=101,
)


# Logistic Regression


Logistic regression is a supervised learning classification algorithm used to predict the probability of a target variable. The target (dependent) variable is dichotomous, which means there are only two possible classes — here `Found` and `Not Found`. Mathematically, a logistic regression model predicts $P(Y=1)$ as a function of $X$.


In [100]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# training split (fit() returns the fitted estimator itself).
logreg = LogisticRegression().fit(t_train, s_train)



In [101]:
# Hold-out accuracy of the logistic-regression model.
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so the
# number is unaffected, but the conventional order keeps this call consistent
# with the decision-tree evaluation and safe if the metric is ever swapped.
print(accuracy_score(s_test, logreg.predict(t_test)))

0.8105906313645621


# Random Forest Classifier

Random forests (or random decision forests) are an ensemble learning method for classification, regression and other tasks. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the individual trees' predictions (classification) or the mean of their predictions (regression).





In [102]:
# Fit a random forest on the training split and report hold-out accuracy.
# random_state pins the bootstrap/feature sampling so the score is reproducible
# on re-run; accuracy_score takes (y_true, y_pred).
rfc = RandomForestClassifier(random_state=101)
rfc.fit(t_train, s_train)
print(accuracy_score(s_test, rfc.predict(t_test)))



0.7321792260692465


# Decision Tree Classifier


A decision tree classifier is a tree in which internal nodes are labeled by features. The classifier categorizes an object $x_i$ by recursively testing the weights that the features labeling the internal nodes have in the vector $x_i$, until a leaf node is reached. The label of that leaf node is then assigned to $x_i$.



In [103]:
# Fit a single decision tree on the training split and report hold-out accuracy.
# random_state fixes the tree's internal randomisation (e.g. tie-breaking between
# equally good splits) so the score is reproducible on re-run.
dtc = DecisionTreeClassifier(random_state=101)
dtc.fit(t_train, s_train)
print(accuracy_score(s_test, dtc.predict(t_test)))

0.7138492871690427


In [104]:
# Final model: refit logistic regression on the full labelled training set
# (no hold-out) and predict the binary label for every evaluation sentence.
# Note that this rebinds `logreg`, replacing the model fitted on the 90% split.
logreg = LogisticRegression().fit(bow_train, train_data['label_found'])
pred = logreg.predict(bow_test)



In [106]:
# Write the predictions to a tab-separated file.
# sep="\t" makes the file an actual TSV as its name promises (to_csv defaults to
# a comma); quoting=3 is csv.QUOTE_NONE, so values are written without quotes.
output = pd.DataFrame(data={"label_found": pred})
output.to_csv("result.tsv", sep="\t", index=False, quoting=3)
# Seeded preview so the displayed rows are stable across re-runs.
output.sample(10, random_state=101)

Unnamed: 0,label_found
314,Not Found
41,Found
649,Found
196,Not Found
580,Not Found
939,Found
720,Not Found
497,Not Found
478,Not Found
906,Found
