In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import re
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS


In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import MLkNN
from sklearn.metrics import f1_score, precision_score, recall_score, silhouette_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

In [None]:
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = '#464646'
plt.rcParams['figure.figsize'] = 10, 7
plt.rcParams['text.color'] = '#666666'
plt.rcParams['axes.labelcolor'] = '#666666'
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['xtick.color'] = '#666666'
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.color'] = '#666666'
plt.rcParams['ytick.labelsize'] = 14

sns.color_palette('dark')
%matplotlib inline

In [None]:
# Load the training data from the zipped CSV of the keyword-extraction dataset
# and preview its first rows.
train = pd.read_csv('../input/facebook-recruiting-iii-keyword-extraction/Train.zip')
train.head()

In [None]:
# (rows, columns) of the full training frame, before any subsetting.
train.shape

In [None]:
# Work on a 20,000-row sample (positions 10000..29999) of the full training set.
# The explicit copy detaches the sample from the full frame, so later column
# assignments do not raise SettingWithCopyWarning and the full frame can be freed.
train = train.iloc[10000:30000, :].copy()

print("Shape of training dataframe after subsetting : ", train.shape)

In [None]:
# Number of whitespace-separated tags per question. Values are passed through
# str() first, so a missing Tags value becomes the string 'nan' and counts as 1.
train['Tag_count'] = train['Tags'].apply(lambda x: len(str(x).split()))

In [None]:
# Preview only: shape the frame would have if every row containing a missing
# value were dropped. dropna() returns a new frame, so `train` is not modified here.
train.dropna().shape

In [None]:
# Count of missing values in each column.
train.isnull().sum()

In [None]:
# Keep only the rows that have a Tags value. The copy makes the result an
# independent frame so that later column assignments (e.g. the cleaned Title)
# do not raise SettingWithCopyWarning.
train = train[train['Tags'].notnull()].copy()
train.shape

In [None]:
# Bar chart of how many questions carry 1, 2, 3, ... tags.
fig = plt.figure(figsize=[10,7])
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train['Tag_count'])
plt.title('Distribution of tag count')
plt.ylabel('Frequency')
plt.xlabel('Tag count')
plt.show()

In [None]:
# Tokenise each Tags string on whitespace so that every tag is one vocabulary
# entry; tag_mat has one row per question and one column per distinct tag.
tag_vectorizer = CountVectorizer(tokenizer= lambda x: str(x).split())
tag_mat = tag_vectorizer.fit_transform(train['Tags'])

In [None]:
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); prefer the new method and fall back to the old one.
# The result is kept as a plain list either way.
if hasattr(tag_vectorizer, 'get_feature_names_out'):
    tag_names = tag_vectorizer.get_feature_names_out().tolist()
else:
    tag_names = tag_vectorizer.get_feature_names()
type(tag_names), len(tag_names)

In [None]:
# Column sums of the tag matrix: total occurrences of each tag across questions.
tag_freq = tag_mat.sum(axis=0)
# .A1 is the flattened 1-D view of the summed result.
type(tag_freq), tag_freq.A1.shape

In [None]:
# Tag -> frequency, ordered from the most to the least frequent tag.
tag_freq_ser = pd.Series(tag_freq.A1, index=tag_names).sort_values(ascending=False)
tag_freq_ser.head()

In [None]:
# Frequency curve over all tags, ordered by descending frequency.
fig, ax = plt.subplots(figsize=[10,7])
ax.plot(tag_freq_ser.values, c=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

In [None]:
# Same curve restricted to the 500 most frequent tags.
fig, ax = plt.subplots(figsize=[10,7])
ax.plot(tag_freq_ser.iloc[:500].values, c=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution of top 500 Tags')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

In [None]:
# Same curve restricted to the 100 most frequent tags.
fig, ax = plt.subplots(figsize=[10,7])
ax.plot(tag_freq_ser.iloc[:100].values, c=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution of top 100 Tags')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

In [None]:
# Word cloud of at most 250 tags, sized by tag frequency.
wordcloud = WordCloud(background_color='blue', max_words=250)
wordcloud.generate_from_frequencies(tag_freq_ser)

fig = plt.figure(figsize=[15,15])
plt.title('WordCloud of Tags')
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# Bar chart of the 50 most frequent tags.
top50_tags = tag_freq_ser.iloc[:50]

fig = plt.figure(figsize=[20,10])
sns.barplot(x=top50_tags.index, y=top50_tags.values, color=sns.xkcd_rgb['blue'])
plt.title('Frequency of top 50 Tags')
plt.xlabel('Tags')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

**Data Preprocessing**

Cleaning text data

In [None]:
# Built once when the cell runs instead of on every call: clean_text is applied
# to every title, and none of these objects depend on the input sentence.
_NON_ALPHA_RE = re.compile(r'[^a-z]+')
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_PORTER_STEMMER = PorterStemmer()


def clean_text(sentence):
    """Normalise a piece of text for vectorisation.

    Steps: lower-case, replace every run of characters outside a-z with a
    single space, tokenise, drop English stop words, Porter-stem each
    remaining token.

    Parameters
    ----------
    sentence : str
        Raw text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    sentence = _NON_ALPHA_RE.sub(' ', sentence.lower()).strip()
    return ' '.join(
        _PORTER_STEMMER.stem(word)
        for word in word_tokenize(sentence)
        if word not in _ENGLISH_STOPWORDS  # stop words are filtered before stemming
    )

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Replace every title with its cleaned form (values are passed through str() first).
train['Title'] = train['Title'].progress_apply(lambda x: clean_text(str(x)))

In [None]:
# Preview the frame after the Title column has been cleaned.
train.head()

In [None]:
def _tags_by_frequency(one_hot_tag):
    """Return column indices ordered from most to least frequent tag (ties keep column order)."""
    counts = np.asarray(one_hot_tag.sum(axis=0)).ravel().tolist()
    return sorted(range(len(counts)), key=counts.__getitem__, reverse=True)


def _percent_covered(one_hot_tag, columns):
    """Return the percentage of rows with at least one non-zero entry in `columns`."""
    nq = one_hot_tag.shape[0]
    if nq == 0:
        # No questions at all: report 0% instead of dividing by zero.
        return np.round(0.0, 2)
    tags_per_question = np.asarray(one_hot_tag[:, columns].sum(axis=1))
    q_with_0_tags = np.count_nonzero(tags_per_question == 0)
    return np.round((nq - q_with_0_tags) / nq * 100, 2)


def questions_covered(one_hot_tag, ntags):
    """Percentage of questions that keep at least one tag among the `ntags` most frequent.

    Parameters
    ----------
    one_hot_tag : 2-D matrix (questions x tags) of tag indicators
    ntags : int
        Number of most frequent tags to keep.

    Returns
    -------
    float
        Percentage in [0, 100], rounded to 2 decimals.
    """
    return _percent_covered(one_hot_tag, _tags_by_frequency(one_hot_tag)[:ntags])


def questions_covered_list(one_hot_tag, window, start=100):
    """Coverage percentage for a growing number of top tags.

    Evaluates ``questions_covered`` for ``start, start + window, ...`` up to
    (but excluding) the total number of tags.

    Parameters
    ----------
    one_hot_tag : 2-D matrix (questions x tags) of tag indicators
    window : int
        Step between successive tag counts.
    start : int, default 100
        First tag count to evaluate.

    Returns
    -------
    (numpy.ndarray, list)
        The tag counts evaluated and the matching coverage percentages.
    """
    ntags = one_hot_tag.shape[1]
    qid_list = np.arange(start, ntags, window)
    # The frequency ranking does not depend on the cut-off, so compute it once.
    ranking = _tags_by_frequency(one_hot_tag)
    ques_covered_list = [
        _percent_covered(one_hot_tag, ranking[:int(n)]) for n in qid_list
    ]
    return qid_list, ques_covered_list


def topn_tags(one_hot_tag, ntags):
    """Return `one_hot_tag` restricted to its `ntags` most frequent tag columns.

    Columns of the result are ordered from most to least frequent tag.
    """
    return one_hot_tag[:, _tags_by_frequency(one_hot_tag)[:ntags]]

In [None]:
# Re-vectorise the tags with binary=True so each entry is a 0/1 indicator of
# whether the question carries that tag; this matrix is the label matrix.
tag_vectorizer = CountVectorizer(tokenizer= lambda x: str(x).split(), binary=True)
y_multinomial = tag_vectorizer.fit_transform(train['Tags'])

In [None]:
# x: number of top tags kept (100, 200, ...); y: % of questions that still have a tag.
x, y = questions_covered_list(y_multinomial, 100)
fig = plt.figure(figsize=[10,7])
plt.title('Questions covered Vs Number of Tags')
plt.ylabel('Percentage of Questions covered')
plt.xlabel('Number of Tags')
plt.plot(x,y, c=sns.xkcd_rgb['greenish cyan'])
plt.show()

In [None]:
print('#Tags\t%Ques')
# x starts at 100 tags, so y[k] belongs to x[k] = 100 * (k + 1) tags. Pair the
# two sequences directly instead of indexing y with idx / 100, which reported
# the coverage of idx + 100 tags and could run past the end of y.
for n_tags, pct_covered in zip(x, y):
    if n_tags % 500 == 0 and n_tags < 7500:
        print(n_tags, '\t', pct_covered)

In [None]:
# Keep only the 100 most frequent tags as label columns. This overwrites
# y_multinomial, so re-running the cell operates on the already reduced matrix.
y_multinomial = topn_tags(y_multinomial, 100)

In [None]:
# Boolean mask (flattened to 1-D) of the questions that still have at least one
# of the retained tags; apply it to both the labels and the frame.
non_zero_idx = (y_multinomial.sum(axis=1) != 0).A1
y_multinomial = y_multinomial[non_zero_idx, :]
train = train.iloc[non_zero_idx, :]

In [None]:
# Sanity check: the label matrix and the frame should have the same number of rows.
y_multinomial.shape, train.shape

In [None]:
# 80/20 split of the cleaned titles and their tag labels (fixed seed).
Xtrain, Xtest, Ym_train, Ym_test = train_test_split(train['Title'], y_multinomial, test_size=0.2, random_state=45)

# TF-IDF features over whitespace tokens. The vocabulary is fitted on the
# training titles only and then applied unchanged to the test titles.
tfid_vec = TfidfVectorizer(tokenizer=lambda x: str(x).split())
Xtrain = tfid_vec.fit_transform(Xtrain)
Xtest = tfid_vec.transform(Xtest)

In [None]:
# Shapes of the TF-IDF feature matrices for the train and test splits.
Xtrain.shape, Xtest.shape

In [None]:
# Shapes of the label matrices for the train and test splits.
Ym_train.shape, Ym_test.shape

In [None]:
def print_score(y_test, y_pred):
    """Print accuracy, macro F1, micro F1 and Hamming loss for a set of predictions.

    Parameters
    ----------
    y_test : true labels
    y_pred : predicted labels, in the same format as `y_test`
    """
    print("Accuracy :", metrics.accuracy_score(y_test, y_pred))
    print("Macro f1 score :", metrics.f1_score(y_test, y_pred, average='macro'))
    print("Micro f1 score :", metrics.f1_score(y_test, y_pred, average='micro'))
    print("Hamming loss :", metrics.hamming_loss(y_test, y_pred))

Supervised Model

SGDClassifier one vs rest

In [None]:
from datetime import datetime

start = datetime.now()
# One SGD-trained classifier per tag column (one-vs-rest), fitted in parallel.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' — confirm
# against the installed version.
logreg_model1 = OneVsRestClassifier(SGDClassifier(loss='log',
                                                  alpha=0.001,
                                                  penalty='l1'),
                                   n_jobs=-1)
# train model
logreg_model1.fit(Xtrain, Ym_train)
# predict tags
Ym_test_pred = logreg_model1.predict(Xtest)

# Report with the shared helper so every model prints the same set of metrics.
print_score(Ym_test, Ym_test_pred)
print("Total Time taken = {}".format(datetime.now() - start))

Logistic regression

In [None]:
 class LogisticRegression(object):
    
    def __init__(Logreg, alpha=0.01, n_iteration=100):   
        Logreg.alpha = alpha                            
        Logreg.n_iter = n_iteration
        
    def _sigmoid_function(Logreg, x): #This function is resonsible for calculating the sigmoid value with given parameter
        value = 1 / (1 + np.exp(-x))
        return value
    def _cost_function(Logreg,h,theta, y): # The fuctions calculates the cost value
        m = len(y)
        cost = (1 / m) * (np.sum(-y.T.dot(np.log(h)) - (1 - y).T.dot(np.log(1 - h))))
        return cost
    
    def _gradient_descent(Logreg,X,h,theta,y,m): # This function calculates the theta value by gradient descent
        gradient_value = np.dot(X.T, (h - y)) / m
        theta -= Logreg.alpha * gradient_value
        return theta

    def predict(Logreg, X): # this function calls the max predict function to classify the individul feauter
        X = np.insert(X, 0, 1, axis=1)
        X_predicted = [max((Logreg._sigmoid_function(i.dot(theta)), c) for theta, c in Logreg.theta)[1] for i in X ]
        return X_predicted

In [None]:
# NOTE(review): X_train_multilabel, y_train and Xtest_multilabel are not assigned
# anywhere in the visible notebook (the split above produces Xtrain / Xtest /
# Ym_train / Ym_test) — confirm the intended inputs before running this cell.
logi = LogisticRegression(n_iteration=30000).fit(X_train_multilabel, y_train)
y_pred1 = logi.predict(Xtest_multilabel)
# NOTE(review): predict() returns one class label per row, while Ym_test is a
# tag-indicator matrix — confirm that print_score accepts this pairing.
print_score(Ym_test, y_pred1)
# `start` was last set in the SGD cell, so this elapsed time also includes
# everything executed since then.
print("Total Time taken = {}".format(datetime.now() - start))

Multilabel KNN

In [None]:
# Multi-label kNN with k=10, fitted on the TF-IDF features and tag labels,
# then scored on the held-out split.
knn = MLkNN(k=10)
knn.fit(Xtrain, Ym_train)
y_pred4 = knn.predict(Xtest)
print_score(Ym_test, y_pred4)


Unsupervised Learning Algorithms

K-Means clustering

In [None]:
X = Xtrain
# Divide the training feature vectors into 1000 groups using k-means clustering.
# (A truncated, unterminated KMeans(...) statement that referenced an undefined
# n_clusters was removed here; it made the whole cell a syntax error.)
km = KMeans(n_clusters=1000, init='random', n_init=10, max_iter=300, tol=1e-04, random_state=0)
y_km = km.fit_predict(X)

# plot the centroids (only their first two feature dimensions are drawn)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],s=250, marker='*',c='red', edgecolor='black',label='centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.show()

Mean Shift

In [None]:
from sklearn.cluster import MeanShift
ms = MeanShift()
# Fit on the first 3000 training rows, densified. X_train_multilabel was never
# defined in this notebook; Xtrain is the feature matrix that the following
# cell plots against these cluster centres.
ms.fit(Xtrain[:3000, :].toarray())
cluster_centers = ms.cluster_centers_

In [None]:
# Xtrain is a sparse matrix; matplotlib needs dense 1-D coordinates, so
# densify just the three feature columns that are plotted.
points = Xtrain[:, :3].toarray()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(points[:, 0], points[:, 1], points[:, 2], marker='o')
ax.scatter(cluster_centers[:,0], cluster_centers[:,1], cluster_centers[:,2], marker='x', color='red', s=300, linewidth=5, zorder=10)
plt.show()
