## Indomain Evaluation

In [8]:
import pandas as pd
import fasttext as ft
pd.set_option('display.max_colwidth', -1)   

### Hotel

In [9]:
hotel = pd.read_csv('data/hotel/hotel.csv')
hotel.head()

Unnamed: 0,text,class
0,This hotel was very modern and sleek.,0
1,"Beautiful, well-laid out, albeiit small rooms.",0
2,"Fantastic breakfast with an incredible selection of meats, cheeses, yogurts, breads etc. And, as a non-German speaker,",0
3,the staff were uber-helpful.,0
4,Great location in front of a u-bahn stop.,0


In [10]:
san = hotel['text'].copy()
for i in range(len(san)):
    san[i] = '__label__'+str(hotel['class'][i])+' '+san[i]
hotel['text'] = san
hotel.head()

Unnamed: 0,text,class
0,__label__0 This hotel was very modern and sleek.,0
1,"__label__0 Beautiful, well-laid out, albeiit small rooms.",0
2,"__label__0 Fantastic breakfast with an incredible selection of meats, cheeses, yogurts, breads etc. And, as a non-German speaker,",0
3,__label__0 the staff were uber-helpful.,0
4,__label__0 Great location in front of a u-bahn stop.,0


In [11]:
hotel = hotel.sample(frac = 1).reset_index(drop = True)
print hotel.shape

(7534, 2)


In [12]:
hotel_train = hotel[:-750]
hotel_test = hotel[-750:]
hotel_test = hotel_test.reset_index(drop = True)

In [13]:
hotel_test.shape

(750, 2)

In [16]:
fw = open('data/hotel/hotel_train.txt','w')
for i in range(len(hotel_train)):
    fw.write(hotel_train['text'][i])
    fw.write('\n')
fw.close()

In [17]:
fw = open('data/hotel/hotel_test.txt','w')
for i in range(len(hotel_test)):
    fw.write(hotel_test['text'][i])
    fw.write('/n')
fw.close()

In [105]:
hotel_clf = ft.supervised('data/hotel/hotel_train.txt','data/hotel/hotel')

In [106]:
hotel_result = hotel_clf.test('data/hotel/hotel_test.txt')
print hotel_result.precision
print hotel_result.recall

1.0
1.0


In [107]:
hotel_test_texts = []
for i in range(len(hotel_test)):
    hotel_test_texts.append(hotel_test['text'][i][11:])

In [108]:
pred = hotel_clf.predict(hotel_test_texts)
hotel_test_pred = []
for i in range(len(pred)):
    hotel_test_pred.append(int(pred[i][0]))
hotel_test_labels = list(hotel_test['class'])

In [109]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(hotel_test_labels, hotel_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.96      0.99      0.97       711
        Sug       0.56      0.26      0.35        39

avg / total       0.94      0.95      0.94       750



In [110]:
from sklearn.metrics import accuracy_score
print accuracy_score(hotel_test_labels,hotel_test_pred)

0.950666666667


## Loading the Trained Model

In [111]:
clf = ft.load_model('hotel.bin', label_prefix = '__label__')

In [112]:
pred = clf.predict(hotel_test_texts)
hotel_test_pred = []
for i in range(len(pred)):
    hotel_test_pred.append(int(pred[i][0]))

In [113]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(hotel_test_labels, hotel_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.97      0.99      0.98       711
        Sug       0.70      0.36      0.47        39

avg / total       0.95      0.96      0.95       750



## Elec

In [44]:
elec = pd.read_csv('data/elec/elec.csv')

san = elec['text'].copy()
for i in range(len(san)):
    san[i] = '__label__'+str(elec['class'][i])+' '+san[i]
elec['text'] = san
elec.head()

Unnamed: 0,text,class
0,__label__0 does your apex dvd player only play dvd audio without video?,0
1,__label__0 or does it play audio and video but scrolling in black and white ?,0
2,"__label__1 before you try to return the player or waste hours calling apex tech support , or run the player over with your car, try these simple troubleshooting ideas first .",1
3,__label__0 hopefully you still have the remote control .,0
4,"__label__0 if you tossed it out of the window , you need to fetch it .",0


In [159]:
print elec.shape

(3782, 2)


In [160]:
elec = elec.sample(frac = 1).reset_index(drop = True)
#print elec.shape

elec_train = elec[:-350]
elec_test = elec[-350:]
elec_test = elec_test.reset_index(drop = True)

In [161]:
fw = open('data/elec/elec_train.txt','w')
for i in range(len(elec_train)):
    fw.write(elec_train['text'][i])
    fw.write('\n')
fw.close()

fw = open('data/elec/elec_test.txt','w')
for i in range(len(elec_test)):
    fw.write(elec_test['text'][i])
    fw.write('/n')
fw.close()

In [167]:
elec_clf = ft.supervised('data/elec/elec_train.txt','data/elec/elec')

In [168]:
elec_result = elec_clf.test('data/elec/elec_test.txt')
print elec_result.precision
print elec_result.recall

1.0
1.0


In [169]:
elec_test_texts = []
for i in range(len(elec_test)):
    elec_test_texts.append(elec_test['text'][i][11:])

pred = elec_clf.predict(elec_test_texts)
elec_test_pred = []
for i in range(len(pred)):
    elec_test_pred.append(int(pred[i][0]))
elec_test_labels = list(elec_test['class'])

In [170]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(elec_test_labels, elec_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.96      0.99      0.98       330
        Sug       0.70      0.35      0.47        20

avg / total       0.95      0.95      0.95       350



In [171]:
from sklearn.metrics import accuracy_score
print accuracy_score(elec_test_labels,elec_test_pred)

0.954285714286


## Travel

In [180]:
travel = pd.read_table('data/travel/travel.txt', names = ['text','class'])
travel = travel.reset_index(drop = True)
travel.head()

Unnamed: 0,text,class
0,You can find useful tips and information on Insight's website .,1
1,Go to the TIPS tab on the homepage and find further information on the following :,0
2,"Frequently Asked Questions : Get answers to the most frequently asked questions about Insight Vacations tours ,",0
3,Travel Information : Travel information on all countries that Insight Vacations tours .,0
4,Your most common questions will be answered within these specially prepared downloads .,0


In [181]:
san = travel['text'].copy()
for i in range(len(san)):
    san[i] = '__label__'+str(travel['class'][i])+' '+san[i]
travel['text'] = san

travel = travel.sample(frac = 1).reset_index(drop = True)
print travel.shape

(5183, 2)


In [182]:
travel_train = travel[:-500]
travel_test = travel[-500:]
travel_test = travel_test.reset_index(drop = True)

fw = open('data/travel/travel_train.txt','w')
for i in range(len(travel_train)):
    fw.write(travel_train['text'][i])
    fw.write('\n')
fw.close()

fw = open('data/travel/travel_test.txt','w')
for i in range(len(travel_test)):
    fw.write(travel_test['text'][i])
    fw.write('/n')
fw.close()

In [235]:
travel_clf = ft.supervised('data/travel/travel_train.txt','data/travel/travel')
travel_result = travel_clf.test('data/travel/travel_test.txt')

print travel_result.precision
print travel_result.recall

1.0
1.0


In [236]:
travel_test_texts = []
for i in range(len(travel_test)):
    travel_test_texts.append(travel_test['text'][i][11:])

pred = travel_clf.predict(travel_test_texts)
travel_test_pred = []
for i in range(len(pred)):
    travel_test_pred.append(int(pred[i][0]))
travel_test_labels = list(travel_test['class'])

In [237]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(travel_test_labels, travel_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.83      0.95      0.88       377
        Sug       0.71      0.39      0.50       123

avg / total       0.80      0.81      0.79       500



In [238]:
from sklearn.metrics import accuracy_score
print accuracy_score(travel_test_labels,travel_test_pred)

0.81


## Sugg User Voice

In [241]:
suguservoice = pd.read_csv('data/suguservoice/suguservoice.csv')
suguservoice.head()

Unnamed: 0,text,class
0,"Please enable removing language code from the Dev Center ""language history"" For example if you ever selected ""ru"" and ""ru-ru"" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad.",1.0
1,"Note: in your .csproj file, there is a SupportedCultures entry like this: <SupportedCultures>de-DE;ru;ru-RU </SupportedCultures> When I removed the ""ru"" language code and published my new xap version, the old xap version still remains in the Store with ""Replaced and unpublished"".",0.0
2,Wich means the new version not fully replaced the old version and this causes me very serious problems: 1.,0.0
3,Some of my users will still receive the old xap version of my app.,0.0
4,The store randomly gives the old xap or the new xap version of my app.,0.0


In [242]:
suguservoice['class'] = suguservoice['class'].fillna(0)
suguservoice['class'] = suguservoice['class'].astype('int64')
suguservoice.head()

Unnamed: 0,text,class
0,"Please enable removing language code from the Dev Center ""language history"" For example if you ever selected ""ru"" and ""ru-ru"" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad.",1
1,"Note: in your .csproj file, there is a SupportedCultures entry like this: <SupportedCultures>de-DE;ru;ru-RU </SupportedCultures> When I removed the ""ru"" language code and published my new xap version, the old xap version still remains in the Store with ""Replaced and unpublished"".",0
2,Wich means the new version not fully replaced the old version and this causes me very serious problems: 1.,0
3,Some of my users will still receive the old xap version of my app.,0
4,The store randomly gives the old xap or the new xap version of my app.,0


In [246]:
type(str(suguservoice['class'][0]))

str

In [249]:
san[1]

'__label__0 Note: in your .csproj file, there is a SupportedCultures entry like this: <SupportedCultures>de-DE;ru;ru-RU </SupportedCultures> When I removed the "ru" language code and published my new xap version, the old xap version still remains in the Store with "Replaced and unpublished".'

In [251]:
type(suguservoice['text'][0])

str

In [252]:
san = suguservoice['text'].copy()
for i in range(len(san)):
    san[i] = '__label__' + str(suguservoice['class'][i]) + ' ' + str(san[i])
suguservoice['text'] = san

suguservoice = suguservoice.sample(frac = 1).reset_index(drop = True)
print suguservoice.shape

(5807, 2)


In [253]:
suguservoice_train = suguservoice[:-500]
suguservoice_test = suguservoice[-500:]
suguservoice_test = suguservoice_test.reset_index(drop = True)

fw = open('data/suguservoice/suguservoice_train.txt','w')
for i in range(len(suguservoice_train)):
    fw.write(suguservoice_train['text'][i])
    fw.write('\n')
fw.close()

fw = open('data/suguservoice/suguservoice_test.txt','w')
for i in range(len(suguservoice_test)):
    fw.write(suguservoice_test['text'][i])
    fw.write('/n')
fw.close()

In [285]:
suguservoice_clf = ft.supervised('data/suguservoice/suguservoice_train.txt','suguservoice')
suguservoice_result = suguservoice_clf.test('data/suguservoice/suguservoice_test.txt')

print suguservoice_result.precision
print suguservoice_result.recall

1.0
1.0


In [286]:
suguservoice_test_texts = []
for i in range(len(suguservoice_test)):
    suguservoice_test_texts.append(suguservoice_test['text'][i][11:])

pred = suguservoice_clf.predict(suguservoice_test_texts)
suguservoice_test_pred = []
for i in range(len(pred)):
    suguservoice_test_pred.append(int(pred[i][0]))
suguservoice_test_labels = list(suguservoice_test['class'])

In [288]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(suguservoice_test_labels, suguservoice_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.87      0.95      0.91       383
        Sug       0.78      0.54      0.64       117

avg / total       0.85      0.86      0.85       500



In [289]:
from sklearn.metrics import accuracy_score
print accuracy_score(suguservoice_test_labels,suguservoice_test_pred)

0.856


## SugHashTag

In [304]:
sughash = pd.read_csv('data/sughash/sughash.csv')
sughash.head()

Unnamed: 0,id,text,expert label
0,6.9e+17,"Advice to consider: Hand Exercises for Knitters, Crocheters, and other Handcrafters with Col... https://t.co/pkV4zXUc3S via @YouTube",1
1,6.9e+17,R: did Minhyuk give you any advice\rHW: nope I prepared everything myself,0
2,6.9e+17,"RT https://t.co/Je3cRDneKu Start 'Em, Sit 'Em Week 7: Live Fantasy Football Mailbag and Lineup Advice _ʹ_ https://t.co/3RyRvBOOZX",0
3,6.9e+17,RT @kxngde123: Best advice for 2016 : LET THAT SHIT GO_�_�_��,0
4,6.9e+17,RT @Sam1963: It must be on the advice of state weatherman? Does New Jersey have one too? https://t.co/7KPztfTeQH,0


In [305]:
sughash = sughash.drop(['id'],axis = 1)
sughash.columns = ['text','class']
sughash.head()

Unnamed: 0,text,class
0,"Advice to consider: Hand Exercises for Knitters, Crocheters, and other Handcrafters with Col... https://t.co/pkV4zXUc3S via @YouTube",1
1,R: did Minhyuk give you any advice\rHW: nope I prepared everything myself,0
2,"RT https://t.co/Je3cRDneKu Start 'Em, Sit 'Em Week 7: Live Fantasy Football Mailbag and Lineup Advice _ʹ_ https://t.co/3RyRvBOOZX",0
3,RT @kxngde123: Best advice for 2016 : LET THAT SHIT GO_�_�_��,0
4,RT @Sam1963: It must be on the advice of state weatherman? Does New Jersey have one too? https://t.co/7KPztfTeQH,0


### Tweets Processor

In [306]:
import re
import codecs
import csv

#start process_tweet
def processTweet(tweet):
    tweet = tweet.lower()
    tweet = " " + tweet
    tweet = re.sub(r'[^\x00-\x7F]+','', tweet)
    #tweet = tweet.replace(" rt "," ")
    tweet = re.sub(' rt ','', tweet)
    tweet = re.sub('(\.)+','.', tweet)
    #tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+) | (http://[^\s]+))','URL',tweet)
    tweet = re.sub('((www\.[^\s]+))','',tweet)
    tweet = re.sub('((http://[^\s]+))','',tweet)
    tweet = re.sub('((https://[^\s]+))','',tweet)
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = re.sub('_','',tweet)
    tweet = re.sub('\$','',tweet)
    tweet = re.sub('%','',tweet)
    tweet = re.sub('^','',tweet)
    tweet = re.sub('&','',tweet)
    tweet = re.sub('\*','',tweet)
    tweet = re.sub('\(','',tweet)
    tweet = re.sub('\)','',tweet)
    tweet = re.sub('-','',tweet)
    tweet = re.sub('\+','',tweet)
    tweet = re.sub('=','',tweet)
    tweet = re.sub('"','',tweet)
    tweet = re.sub('~','',tweet)
    tweet = re.sub('`','',tweet)
    tweet = re.sub('!','',tweet)
    tweet = re.sub(':','',tweet)
    tweet = re.sub('^-?[0-9]+$','', tweet)
    tweet = tweet.strip('\'"')
    return tweet

#end

In [307]:
# example
print sughash['text'][0]
print processTweet(sughash['text'][0])

Advice to consider: Hand Exercises for Knitters, Crocheters, and other Handcrafters with Col... https://t.co/pkV4zXUc3S via @YouTube
 advice to consider hand exercises for knitters, crocheters, and other handcrafters with col. via 


In [308]:
san = sughash['text'].copy()
for i in range(len(san)):
    san[i] = '__label__' + str(sughash['class'][i]) + ' ' + str(processTweet(san[i]))
sughash['text'] = san

sughash = sughash.sample(frac = 1).reset_index(drop = True)
print sughash.shape

(4099, 2)


In [309]:
sughash.head()

Unnamed: 0,text,class
0,__label__1 helpful advice for lifeline customers when your telephone line stops working priority care servic,1
1,"__label__1 police warning stay with your car when you scrape off that ice as the cold weather has drawn in, west yorksh.",1
2,__label__0 im jealous of arkansas. i wish a winter storm would stretch across central texas,0
3,__label__0 while y'all praying for snow i'm asking god to block the devils work so he not hearing y'all snow request i put too m,0
4,__label__0 advice or sex tips??? i'll give you my personal hotline for advice. dm or mention me.,0


In [310]:
sughash_train = sughash[:-400]
sughash_test = sughash[-400:]
sughash_test = sughash_test.reset_index(drop = True)

fw = open('data/sughash/sughash_train.txt','w')
for i in range(len(sughash_train)):
    fw.write(sughash_train['text'][i])
    fw.write('\n')
fw.close()

fw = open('data/sughash/sughash_test.txt','w')
for i in range(len(sughash_test)):
    fw.write(sughash_test['text'][i])
    fw.write('/n')
fw.close()

In [322]:
sughash_clf = ft.supervised('data/sughash/sughash_train.txt','data/sughash/sughash')
sughash_result = sughash_clf.test('data/sughash/sughash_test.txt')

print sughash_result.precision
print sughash_result.recall

1.0
1.0


In [323]:
sughash_test_texts = []
for i in range(len(sughash_test)):
    sughash_test_texts.append(sughash_test['text'][i][11:])

pred = sughash_clf.predict(sughash_test_texts)
sughash_test_pred = []
for i in range(len(pred)):
    sughash_test_pred.append(int(pred[i][0]))
sughash_test_labels = list(sughash_test['class'])

In [324]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(sughash_test_labels, sughash_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.86      0.88      0.87       282
        Sug       0.70      0.66      0.68       118

avg / total       0.81      0.82      0.82       400



In [325]:
from sklearn.metrics import accuracy_score
print accuracy_score(sughash_test_labels,sughash_test_pred)

0.8175


# All Files Concatination

In [366]:
import fileinput
filenames = ['data/hotel/hotel_test.txt','data/hotel/hotel_train.txt',
             'data/elec/elec_test.txt','data/elec/elec_train.txt',
             'data/travel/travel_test.txt','data/travel/travel_train.txt',
             'data/sughash/sughash_test.txt','data/sughash/sughash_train.txt',
             'data/suguservoice/suguservoice_test.txt','data/suguservoice/suguservoice_train.txt'
            ]

In [367]:
with open('trainer.txt', 'w') as fout:
    fin = fileinput.input(filenames)
    for line in fin:
        fout.write(line)
    fin.close()

In [370]:
trainer_clf = ft.supervised('trainer.txt', 'trainer')

In [371]:
final_df = hotel.copy()
final_df = final_df.append(elec,ignore_index = True)
final_df = final_df.append(travel,ignore_index = True)
final_df = final_df.append(suguservoice,ignore_index = True)
final_df = final_df.append(sughash,ignore_index = True)
final_df.head()

Unnamed: 0,text,class
0,__label__0 the rooms were fairly dirty.,0
1,"__label__0 The place was spotless, even with a convention going on.",0
2,"__label__0 Pool was warm, hot tub was non-existent it was covered up or under construction on what looked to be not touched in a long time.",0
3,"__label__0 We love the pink, the private beach, the whole opulent feel of the hotel.",0
4,"__label__1 Overall, if cost is a main concern, and you want stay in a basic room for $50 with decent service, in an acceptable neighborhood, then you apparently dont have enough $$$ to stay in a San Francisco hotel, try an upgrade to a local hostel, or stay home.",1


In [375]:
final_df = final_df.sample(frac = 1).reset_index(drop=True)
final_df.head()

Unnamed: 0,text,class
0,__label__0 people influence people. nothing influences people more than a recommendation from a trusted friend. mark zuckerberg,0
1,__label__0 when ur the selfless dependable friend everyone goes to for advice but when you need to talk no one listens to you htt,0
2,"__label__0 A part time job and cheap bed , where I can sleep lots , eat well , and lift properly again is my idea .",0
3,__label__0 dying cancer patient's last wish 'i want to see my horse',0
4,__label__0 and this is on a quite street just behind the KaDeWe.,0


In [376]:
final_df['class'].value_counts(normalize = True)

0    0.827949
1    0.172051
Name: class, dtype: float64

In [377]:
san = final_df['text'].copy()
for i in range(len(san)):
    san[i] = '__label__' + str(final_df['class'][i]) + ' ' + str(san[i])
final_df['text'] = san

final_df = final_df.sample(frac = 1).reset_index(drop = True)
print final_df.shape

(26405, 2)


In [379]:
final_df_train = final_df[:-2600]
final_df_test = final_df[-2600:]
final_df_test = final_df_test.reset_index(drop = True)

fw = open('final_df_train.txt','w')
for i in range(len(final_df_train)):
    fw.write(final_df_train['text'][i])
    fw.write('\n')
fw.close()

fw = open('final_df_test.txt','w')
for i in range(len(final_df_test)):
    fw.write(final_df_test['text'][i])
    fw.write('/n')
fw.close()

In [384]:
final_df_clf = ft.supervised('final_df_train.txt','final_df')

In [385]:
final_df_test_texts = []
for i in range(len(final_df_test)):
    final_df_test_texts.append(final_df_test['text'][i][11:])

pred = final_df_clf.predict(final_df_test_texts)
final_df_test_pred = []
for i in range(len(pred)):
    final_df_test_pred.append(int(pred[i][0]))
final_df_test_labels = list(final_df_test['class'])

In [386]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(final_df_test_labels, final_df_test_pred, target_names=['Not Sug','Sug']))

             precision    recall  f1-score   support

    Not Sug       0.90      0.95      0.92      2162
        Sug       0.65      0.47      0.55       438

avg / total       0.86      0.87      0.86      2600



In [387]:
from sklearn.metrics import accuracy_score
print accuracy_score(final_df_test_labels,final_df_test_pred)

0.868461538462
