In [13]:
# for Python 2: use print only as a function
# from __future__ import print_function

In [14]:
import ast

import pandas as pd
import numpy as np
from collections import defaultdict
import collections
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.multioutput import MultiOutputClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


In [15]:
# Location of the question dump, relative to the notebook.
path = './datascience_datasets_for_doc_similarity.csv'

# The file is parsed without a header row; the 'answers' column is not needed
# for tag prediction, so it is dropped straight after loading.
d_set = pd.read_csv(
    path,
    header=None,
    names=['tag', 'title', 'answers'],
).drop(['answers'], axis=1)

d_set.head()


Unnamed: 0,tag,title
0,"['machine-learning', 'classification', 'evalua...",How to improve an existing (trained) classifier?
1,"['machine-learning', 'r', 'logistic-regression...","Random Forest, Type - Regression, Calculation ..."
2,['visualization'],How to analyze which site has most numbers
3,['bigdata'],Privacy through fake data?
4,"['r', 'data-wrangling']",When to choose character instead of factor in R?


In [16]:
# Each entry of d_set['tag'] is the string form of a Python list, so it is
# parsed with ast.literal_eval before its tags are added to the set.
unique_tags = set()
for raw_tag_list in d_set['tag']:
    unique_tags.update(ast.literal_eval(raw_tag_list))

# Distinct tag names across the whole dataset (order is not significant).
y_labels = list(unique_tags)

# for i in range(len(y_labels)):
#     if (y_labels[i].find('-') > -1):
#         y_labels[i] = y_labels[i].replace('-','')

In [17]:
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


class lemmatokenizer(object):
    """Callable tokenizer: split a document into words and stem each word.

    Instances are passed as the ``tokenizer`` argument of the vectorizers
    defined later in this notebook.
    """

    def __init__(self):
        # Tokens are runs of two or more word characters.
        self.token_pattern = r"(?u)\b\w\w+\b"
        self.stemmer = SnowballStemmer('english')

    def __call__(self, doc):
        """Return the list of stemmed tokens found in ``doc`` (one string)."""
        words = re.compile(self.token_pattern).findall(doc)
        stems = []
        for word in words:
            stems.append(self.stemmer.stem(word))
        return stems


In [18]:
# Settings shared by both vectorizers; only the weighting scheme differs.
_common_vectorizer_kwargs = dict(
    max_df=0.5,
    min_df=5,
    stop_words='english',
    ngram_range=(1, 3),
)

# Raw term counts over the stemmed title tokens.
vect_title = CountVectorizer(tokenizer=lemmatokenizer(),
                             **_common_vectorizer_kwargs)

# TF-IDF weighted counterpart (IDF smoothing switched off).
tfidf_vect_title = TfidfVectorizer(smooth_idf=False,
                                   tokenizer=lemmatokenizer(),
                                   **_common_vectorizer_kwargs)


# What we are actually doing: how the tags are encoded

### tag                           
['r', 'machine-learning', 'ai'] 
   
### LabelEncoder()
[32, 324, 17]              
   
### MultiLabelBinarizer()
[0,0,0,0,........1,0,0,1,1,0,...]


In [19]:
# Map every distinct tag name to an integer id.
le = preprocessing.LabelEncoder()
le.fit(y_labels)

# Encode each question's tag list (stored as the string form of a list).
encoded_tags = [le.transform(ast.literal_eval(raw_tags)) for raw_tags in d_set['tag']]

# Build the Series on d_set's own index: column assignment aligns on index
# labels, so a Series carrying a fresh 0..n-1 index would silently produce NaN
# rows whenever d_set is not indexed 0..n-1.
d_set['label_num'] = pd.Series(encoded_tags, index=d_set.index)
d_set.head()


Unnamed: 0,tag,title,label_num
0,"['machine-learning', 'classification', 'evalua...",How to improve an existing (trained) classifier?,"[119, 29, 65]"
1,"['machine-learning', 'r', 'logistic-regression...","Random Forest, Type - Regression, Calculation ...","[119, 170, 117, 171]"
2,['visualization'],How to analyze which site has most numbers,[225]
3,['bigdata'],Privacy through fake data?,[21]
4,"['r', 'data-wrangling']",When to choose character instead of factor in R?,"[170, 48]"


In [20]:
# One entry of encoded tag ids per question, in row order.
new_y_labels = d_set['label_num'].tolist()

# print (new_y_labels)


In [21]:
# Learn the set of tag ids; fit() returns the fitted binarizer itself.
mlb = MultiLabelBinarizer().fit(new_y_labels)
mlb

# mlb.transform(new_y_labels).shape


MultiLabelBinarizer(classes=None, sparse_output=False)

In [22]:
# Indicator matrix produced by the fitted MultiLabelBinarizer:
# one row per question, one column per tag id.
y_tag_dtm = mlb.transform(new_y_labels)

np.shape(y_tag_dtm)


(1223, 231)

In [23]:
# Question titles as a plain Python list (the raw text fed to the vectorizers).
X_labels = d_set['title'].tolist()

# print (X_labels)


In [24]:
# Learn the vocabulary from the titles and build the document-term count
# matrix in one step.
X_title_dtm = vect_title.fit_transform(X_labels)

X_title_dtm


<1223x339 sparse matrix of type '<class 'numpy.int64'>'
	with 4929 stored elements in Compressed Sparse Row format>

# Feature Transformation
## PCA implementation

In [25]:
from sklearn.decomposition import PCA


# Densify the sparse count matrix once and reuse it for both fit and transform.
title_counts_dense = X_title_dtm.toarray()

# random_state is pinned because, for a component count this far below the
# feature count, scikit-learn's default 'auto' solver can select the randomized
# SVD, which would otherwise give slightly different projections on every run.
pca = PCA(n_components=100, random_state=0).fit(title_counts_dense)
pca_samples = pca.transform(title_counts_dense)

# Projected titles, rounded to 4 decimals for display.
pca_df = pd.DataFrame(np.round(pca_samples, 4))

pca_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.1639,0.0399,-0.1656,-0.0699,0.0036,-0.1085,-0.0659,0.0288,-0.0431,0.2704,...,-0.0156,-0.0488,0.0473,-0.1021,0.0293,0.0226,-0.024,-0.0023,0.0615,0.0144
1,-0.2537,0.0441,-0.2416,-0.1782,-0.2837,0.5325,0.5208,0.1765,-0.1154,-0.0704,...,-0.4197,-0.0383,0.1125,0.0897,-0.2468,0.2057,0.1474,-0.4358,0.0248,0.1004
2,-0.1695,0.0244,-0.1774,-0.0856,-0.0291,-0.1381,-0.0116,0.0189,0.0609,0.0381,...,0.0456,0.06,-0.0011,0.0683,0.0442,-0.005,0.0543,0.0719,-0.0203,0.0814
3,0.7518,0.0536,-0.0251,-0.0947,-0.0301,-0.0468,-0.0012,-0.0295,-0.0005,-0.0187,...,-0.0065,-0.0095,-0.0192,-0.0069,-0.0045,-0.0257,-0.0137,-0.0009,0.0067,0.0116
4,-0.1978,0.0242,-0.1515,-0.1133,-0.0075,-0.1201,-0.0386,-0.0495,-0.0443,0.001,...,-0.1913,-0.0783,-0.45,-0.1523,0.2227,0.0237,0.073,-0.1094,0.1722,-0.2333


In [26]:
# Vocabulary terms in column order. Recent scikit-learn releases replaced
# get_feature_names() with get_feature_names_out(); use whichever exists so the
# cell runs on both old and new versions.
if hasattr(vect_title, 'get_feature_names_out'):
    title_feature_names = vect_title.get_feature_names_out()
else:
    title_feature_names = vect_title.get_feature_names()

# Dense count matrix as a DataFrame with one column per vocabulary term.
new_df = pd.DataFrame(X_title_dtm.toarray(), columns=title_feature_names)

new_df.shape


(1223, 339)

In [28]:
# Occurrence count of every vocabulary term (each term appears once).
# get_feature_names() no longer exists in recent scikit-learn releases, so
# prefer get_feature_names_out() when it is available.
if hasattr(vect_title, 'get_feature_names_out'):
    d = collections.Counter(vect_title.get_feature_names_out())
else:
    d = collections.Counter(vect_title.get_feature_names())

# Attach each question's binary tag vector (one row of y_tag_dtm) as a column.
new_df['target_list'] = list(y_tag_dtm)


In [29]:
# new_df.columns[:100]  
# new_df.ix[0]

# new_df.head() 


In [30]:
# Fit the TF-IDF vectorizer on the titles and transform them in one step.
X_title_dtm_tfidf = tfidf_vect_title.fit_transform(X_labels)

X_title_dtm_tfidf


<1223x339 sparse matrix of type '<class 'numpy.float64'>'
	with 4929 stored elements in Compressed Sparse Row format>

In [31]:
# Vocabulary terms in column order. Recent scikit-learn releases replaced
# get_feature_names() with get_feature_names_out(); use whichever exists so the
# cell runs on both old and new versions.
if hasattr(tfidf_vect_title, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect_title.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vect_title.get_feature_names()

# Dense TF-IDF matrix as a DataFrame with one column per vocabulary term.
new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),
                               columns=tfidf_feature_names)

# Attach each question's binary tag vector (one row of y_tag_dtm) as a column.
new_df_of_tfidf['target_list'] = list(y_tag_dtm)

# Target column and feature columns, still as pandas objects at this point.
y = new_df_of_tfidf['target_list']
X = new_df_of_tfidf.drop('target_list', axis=1)


In [32]:
# y = new_df['target_list']
# X = new_df.drop('target_list',axis=1)

# # X = X.ix[:]                          # it will return each feature row wise to X
# # X = X.values

# print (type(X))


In [33]:
# Build the arrays from new_df_of_tfidf rather than from the previous values of
# X / y: the cell can then be re-run safely (the old form called `.values` on
# X, which fails once X has already been turned into an ndarray).
X = np.ascontiguousarray(new_df_of_tfidf.drop('target_list', axis=1).values)
y = np.array(new_df_of_tfidf['target_list'].tolist())   # stack row vectors into a 2-D array

# random_state is pinned so the projections are reproducible: with these
# component counts scikit-learn's 'auto' solver can select the randomized SVD.
pca_X = PCA(n_components=200, random_state=0).fit_transform(X)
pca_X = np.round(pca_X, 4)

# NOTE(review): pca_y is a real-valued projection of the binary tag matrix and
# is only referenced by the commented-out split further down.
pca_y = PCA(n_components=50, random_state=0).fit_transform(y)
pca_y = np.round(pca_y, 4)

print(pca_y)


[[ 8.366e-01 -2.590e-02  4.576e-01 ...  2.100e-02 -4.410e-02  1.900e-03]
 [ 5.722e-01 -4.540e-01  2.170e-01 ... -3.650e-02 -5.610e-02  9.400e-03]
 [-3.276e-01 -8.170e-02 -7.700e-02 ...  5.100e-03 -3.100e-03  1.450e-02]
 ...
 [-2.637e-01 -1.401e-01 -8.150e-02 ... -3.193e-01 -2.057e-01 -4.500e-03]
 [-2.342e-01 -4.827e-01  7.541e-01 ... -7.000e-04  3.800e-03  5.900e-03]
 [-4.055e-01 -4.964e-01  1.827e-01 ... -1.800e-03 -3.100e-03  2.500e-03]]


In [34]:
# Hold out 20% of the questions for evaluation; the seed keeps the split stable.
_split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = _split

# X_train, X_test, y_train, y_test = train_test_split(pca_X,
#                                                     pca_y,
#                                                     test_size=0.2,
#                                                     random_state=1)   


In [35]:
# k-nearest-neighbours classifier trained directly on the multilabel indicator
# target. (MultinomialNB was also tried but, per the original notes, does not
# work with a multilabel target.)
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted indicator rows for the held-out questions.
knn_pred = knn_clf.predict(X_test)


In [36]:
# One binary SVC per tag (one-vs-rest), each with probability estimates enabled.
base_svc = SVC(probability=True, random_state=0)
svc_clf = OneVsRestClassifier(base_svc).fit(X_train, y_train)

# Predicted indicator rows for the held-out questions.
svc_pred = svc_clf.predict(X_test)


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [37]:
# Mean accuracy of the kNN model on the held-out questions.
knn_clf.score(X=X_test, y=y_test)

0.024489795918367346

In [39]:
# """ 
# it will give 0.0, since it's matching with the exact no. of 
# target labels and not giving credit for partial correct 
# prediction of labels. 
# """

# svc_clf.score(X_test, y_test)  


In [40]:
# Text report for the kNN predictions (a plain string).
knn_report = metrics.classification_report(y_test, knn_pred)

# All remaining scores use average='samples'.
_per_sample = {'average': 'samples'}

knn_f1_score = metrics.f1_score(y_test, knn_pred, **_per_sample)

knn_precision_recall_fscore = metrics.precision_recall_fscore_support(
    y_test, knn_pred, **_per_sample)

# NOTE(review): the two ranking metrics below receive hard 0/1 predictions
# rather than scores/probabilities - confirm that this is intended.
knn_avg_precision_score = metrics.average_precision_score(
    y_test, knn_pred, **_per_sample)

knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred, **_per_sample)

# No MultinomialNB report: per the original notes it cannot handle the
# multilabel output.


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [41]:
# Text report for the one-vs-rest SVC predictions.
svc_report = metrics.classification_report(y_test, svc_pred)

# All remaining scores use average='samples'.
_per_sample = {'average': 'samples'}

svc_f1_score = metrics.f1_score(y_test, svc_pred, **_per_sample)

svc_precision_recall_fscore = metrics.precision_recall_fscore_support(
    y_test, svc_pred, **_per_sample)

# NOTE(review): the two ranking metrics below receive hard 0/1 predictions
# rather than scores/probabilities - confirm that this is intended.
svc_avg_precision_score = metrics.average_precision_score(
    y_test, svc_pred, **_per_sample)

svc_roc_auc_score = metrics.roc_auc_score(y_test, svc_pred, **_per_sample)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
# """  because it will also give 0.0 for the same reason discussed above. """

# NOTE: for a multilabel target accuracy_score (below) is the exact-match (subset) accuracy, not a Hamming score
# metrics.accuracy_score(y_true=y_test[:], y_pred=svc_pred[:])   


In [42]:
# Only the two ranking scores are printed for the SVC model.
svc_summary = [
    ("avg. precision_score : ", svc_avg_precision_score),
    ("roc_auc_score : ", svc_roc_auc_score),
]
for caption, value in svc_summary:
    print(caption, value)


avg. precision_score :  0.0108843537414966
roc_auc_score :  0.5


In [43]:
# For a multilabel target accuracy_score only counts rows whose complete label
# set is predicted correctly, so it matches knn_clf.score(X_test, y_test) above.
metrics.accuracy_score(y_true=y_test, y_pred=knn_pred)


0.024489795918367346

In [44]:
# knn_report (a str) is available for a per-label breakdown; only the
# aggregate scores are printed here.
print("For knn_clf (KNearestNeighbours) : ")
knn_summary = [
    ("precision, recall, fbeta_score, support : ", knn_precision_recall_fscore),
    ("f1_score : ", knn_f1_score),
    ("avg. precision_score : ", knn_avg_precision_score),
    ("roc_auc_score : ", knn_roc_auc_score),
]
for caption, value in knn_summary:
    print(caption, value)


For knn_clf (KNearestNeighbours) : 
precision, recall, fbeta_score, support :  (0.22176870748299318, 0.12523809523809523, 0.1462779397473275, None)
f1_score :  0.1462779397473275
avg. precision_score :  0.117744058662426
roc_auc_score :  0.5620747887469787


In [45]:
test = ["why is overfitting bad in machine learning ?"]
# test = ["what is lstm ?"]

# test_dtm = vect_title.transform(test)              # without tfidf
test_dtm = tfidf_vect_title.transform(test)          # with tfidf

# True when at least one vocabulary term occurs in the query.
status = bool(test_dtm.toarray()[0].any())

# Predicted indicator row -> tuple of encoded tag ids per query.
ans = knn_clf.predict(test_dtm.toarray())
ans = mlb.inverse_transform(ans)

if (len(ans[0]) == 0 or not status):
    print ("sorry, we can't predict your category!!!")
else:
    # LabelEncoder.inverse_transform is defined for 1-D input; current
    # scikit-learn rejects a 2-D list of tuples. Decode each query's ids
    # separately and stack the results to keep the original 2-D output.
    ans = np.array([le.inverse_transform(np.asarray(label_ids)) for label_ids in ans])
    print (ans)
    
    
# mlb.transform([[224,0,100]]) 
# ans 
# test_dtm.toarray()


[['machine-learning']]


  if diff:


In [46]:
# One small random forest per tag, fitted in parallel on all available cores.
forest = RandomForestClassifier(n_estimators=10, random_state=0)
rf_clf = MultiOutputClassifier(estimator=forest, n_jobs=-1).fit(X_train, y_train)

# Predicted indicator rows for the held-out questions.
rf_pred = rf_clf.predict(X_test)

rf_clf


MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
           n_jobs=-1)

In [47]:
# Exact-match accuracy on the first 100 held-out rows. The value used to be
# computed and thrown away (only a cell's last expression is displayed), so it
# is now kept in a variable and printed.
rf_subset_accuracy_first100 = metrics.accuracy_score(y_true=y_test[:100],
                                                     y_pred=rf_pred[:100])
print("accuracy_score (first 100 test rows) : ", rf_subset_accuracy_first100)

# Mean accuracy over the whole held-out set (displayed as the cell output).
rf_clf.score(X_test, y_test)

# no such function in MultiOutputClassifier(), maybe version issue with scikit-learn
# rf_clf.predict_log_proba(X_test)      


0.04081632653061224

In [48]:
# print (knn_clf.predict_proba(X_train))                                         

# below code throwing error
# print (rf_clf.predict_proba(X_train[:5])) 


In [49]:
# Text report for the random-forest predictions.
rf_report = metrics.classification_report(y_test, rf_pred)

# All remaining scores use average='samples'.
_per_sample = {'average': 'samples'}

rf_f1_score = metrics.f1_score(y_test, rf_pred, **_per_sample)

rf_precision_recall_fscore = metrics.precision_recall_fscore_support(
    y_test, rf_pred, **_per_sample)

# NOTE(review): the two ranking metrics below receive hard 0/1 predictions
# rather than scores/probabilities - confirm that this is intended.
rf_avg_precision_score = metrics.average_precision_score(
    y_test, rf_pred, **_per_sample)

rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, **_per_sample)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [50]:
# rf_report is available for a per-label breakdown; only the aggregate scores
# are printed here.
print("For rf_clf (RandomForest) : ")
rf_summary = [
    ("precision, recall, fbeta_score, support : ", rf_precision_recall_fscore),
    ("f1_score : ", rf_f1_score),
    ("avg. precision_score : ", rf_avg_precision_score),
    ("roc_auc_score : ", rf_roc_auc_score),
]
for caption, value in rf_summary:
    print(caption, value)


For rf_clf (RandomForest) : 
precision, recall, fbeta_score, support :  (0.27925170068027205, 0.1814965986394558, 0.2010689990281827, None)
f1_score :  0.2010689990281827
avg. precision_score :  0.1536765615337044
roc_auc_score :  0.5897131745398898


In [51]:
test = ["what is lstm ?"]

# test_dtm = vect_title.transform(test)          # without tfidf
test_dtm = tfidf_vect_title.transform(test)      # with tfidf

# True when at least one vocabulary term occurs in the query.
status = bool(test_dtm.toarray()[0].any())

# Predicted indicator row -> tuple of encoded tag ids per query.
ans = rf_clf.predict(test_dtm.toarray())
ans = mlb.inverse_transform(ans)
if (len(ans[0]) == 0 or not status):
    print ("sorry, we can't predict your category!!!")
else:
    # LabelEncoder.inverse_transform is defined for 1-D input; current
    # scikit-learn rejects a 2-D list of tuples. Decode each query's ids
    # separately and stack the results to keep the original 2-D output.
    ans = np.array([le.inverse_transform(np.asarray(label_ids)) for label_ids in ans])
    print (ans)
    
# mlb.transform([[224,0,100]]) 
# ans 
# test_dtm.toarray()


sorry, we can't predict your category!!!


## Pickling (saving the ML model)

In [52]:
# sklearn.externals.joblib no longer exists in current scikit-learn releases;
# prefer the standalone package and fall back to the vendored copy on old
# installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: only the classifier is persisted. The fitted tfidf_vect_title, mlb and
# le objects used at prediction time are not saved by this cell.
joblib.dump(rf_clf, 'datascience_classifier.pkl')
# new_clf = joblib.load('classifier.pkl')


['datascience_classifier.pkl']

In [53]:
# Restore the classifier written by the previous cell. joblib files are
# pickles, so only load files from a trusted source.
new_pkl_clf = joblib.load(filename='datascience_classifier.pkl')

new_pkl_clf


MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
           n_jobs=-1)

In [54]:
test = ["why is overfitting bad in machine learning ?"]

# test_dtm = vect_title.transform(test)              # without tfidf
test_dtm = tfidf_vect_title.transform(test)          # with tfidf

# True when at least one vocabulary term occurs in the query.
status = bool(test_dtm.toarray()[0].any())

# Predicted indicator row -> tuple of encoded tag ids per query.
ans = new_pkl_clf.predict(test_dtm.toarray())
ans = mlb.inverse_transform(ans)
if (len(ans[0]) == 0 or not status):
    print ([["sorry, we can't predict your category!!!"]])
else:
    # LabelEncoder.inverse_transform is defined for 1-D input; current
    # scikit-learn rejects a 2-D list of tuples. Decode each query's ids
    # separately and stack the results to keep the original 2-D output.
    ans = np.array([le.inverse_transform(np.asarray(label_ids)) for label_ids in ans])
    print (ans)
    

[['machine-learning' 'predictive-modeling']]


  if diff:
