In [None]:
import pandas as pd

# Tag table; its 'Id' and 'Tag' columns are what the later cells join and group on.
tag_df = pd.read_csv(filepath_or_buffer="../input/statsquestions/Tags.csv")
tag_df.head()

In [None]:
# Question table, decoded with the ISO-8859-1 codec rather than pandas' default.
questions_df = pd.read_csv(
    filepath_or_buffer="../input/statsquestions/Questions.csv",
    encoding="ISO-8859-1",
)
questions_df.head()

In [None]:
# Inner join on 'Id': keeps only rows whose Id appears in both tables.
total_df = questions_df.merge(tag_df, on='Id', how='inner')
total_df.head(2)

In [None]:
# Collapse the per-tag rows into one row per Id whose 'Tag' value is a
# comma-separated string of all that Id's tags.
concat_tag_df = (
    total_df
    .groupby(['Id'])['Tag']
    .apply(lambda tag_values: ",".join(tag_values))
    .reset_index()
)
concat_tag_df.head(2)

In [None]:
# Attach the comma-joined tag string to each question and keep only the
# three columns the rest of the notebook works with.
input_df = (
    questions_df
    .merge(concat_tag_df, on='Id', how='inner')
    .loc[:, ['Title', 'Body', 'Tag']]
)
input_df.head()

In [None]:
# Rows per tag (the 'Id' column holds the count), least frequent first.
tags_count_df = tag_df.groupby('Tag').count()
tags_count_df_asc = tags_count_df.sort_values('Id')
# Peek at the rarest tags that still occur at least 3 times.
tags_count_df_asc[tags_count_df_asc['Id'] >= 3].head()

In [None]:
# Same counts, most frequent tags first.
tags_count_df_desc = tags_count_df.sort_values('Id', ascending=False)
tags_count_df_desc.head()

In [None]:
%matplotlib inline

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_word_cloud(text):
    """Draw a word cloud for ``text`` and show it inline.

    Args:
        text: A single string; it is handed to ``WordCloud.generate`` as-is.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    cloud_options = dict(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
    )
    rendered_cloud = WordCloud(**cloud_options).generate(text)

    # Borderless 8x8 figure so the cloud fills the whole canvas.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(rendered_cloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# One long string holding every question's tag list, each prefixed by " ,",
# fed to the word cloud to visualise tag frequency.
tags = ' ' + ''.join(" ," + tag_list for tag_list in input_df['Tag'])
plot_word_cloud(tags)

In [None]:
# Features (the two text columns) and target (the comma-joined tag string).
df_x = input_df.loc[:, ['Title', 'Body']]
df_y = input_df.loc[:, ['Tag']]

In [None]:
from gensim import utils
import gensim.parsing.preprocessing as gsp


# Ordered gensim preprocessing steps; clean_text applies them one after the
# other, so the order below matters. Per-step notes follow the gensim
# `parsing.preprocessing` documentation.
filters =[
    gsp.strip_tags,                  # remove markup tags such as <p>...</p>
    gsp.strip_punctuation,           # replace punctuation characters with spaces
    gsp.strip_multiple_whitespaces,  # collapse runs of whitespace
    gsp.strip_numeric,               # remove digits
    gsp.remove_stopwords,            # drop gensim's built-in stopwords
    gsp.strip_short,                 # drop very short tokens (gensim default minimum length)
    gsp.stem_text                    # Porter-stem the remaining tokens
]

def clean_text(s):
    """Lower-case ``s`` and run it through every function in ``filters``.

    Args:
        s: Raw text (anything with ``.lower()`` that ``utils.to_unicode`` accepts).

    Returns:
        The text after lower-casing, unicode conversion and each filter
        applied in list order.
    """
    cleaned = utils.to_unicode(s.lower())
    for apply_filter in filters:
        cleaned = apply_filter(cleaned)
    return cleaned
    

In [None]:
# Raw title of the first question.
input_df['Title'].iloc[0]

In [None]:
# The same title after clean_text.
clean_text(input_df['Title'].iloc[0])

In [None]:
# Raw body of the first question.
input_df['Body'].iloc[0]

In [None]:
# The same body after clean_text.
clean_text(input_df['Body'].iloc[0])

In [None]:
# Concatenate every cleaned title (space-separated) into one string for the cloud.
titles = ' ' + ''.join(' ' + clean_text(title) for title in input_df['Title'])

plot_word_cloud(titles)

In [None]:
# Concatenate every cleaned body (each prefixed by a space) into one string for the cloud.
bodies = ''.join(' ' + clean_text(body) for body in input_df['Body'])

plot_word_cloud(bodies)

In [None]:
def plot_word_cloud_of_body_for_tag(tag_name):
    """Plot a word cloud of the cleaned bodies of questions carrying ``tag_name``.

    A question is selected only when ``tag_name`` is exactly one of the
    entries in its comma-separated 'Tag' string. The previous
    ``str.contains`` test treated the name as a regular expression and as a
    substring, so 'probability' also pulled in 'conditional-probability' and
    a name such as 'c++' raised ``re.error``.

    Args:
        tag_name: The tag to select, compared literally.

    Returns:
        None. The cloud is drawn by ``plot_word_cloud``.
    """
    # Split each row's "a,b,c" string and test for literal membership.
    has_tag = (
        input_df['Tag']
        .str.split(',')
        .apply(lambda row_tags: tag_name in row_tags)
        .astype(bool)
    )
    tag_specific_df = input_df[has_tag]

    # Same layout as before: every cleaned body is prefixed by one space.
    tag_specific_body = ''.join(
        ' ' + clean_text(body) for body in tag_specific_df['Body']
    )

    plot_word_cloud(tag_specific_body)

In [None]:
# Body word cloud for questions tagged 'matlab'.
plot_word_cloud_of_body_for_tag(tag_name='matlab')

In [None]:
# Body word cloud for questions tagged 'probability'.
plot_word_cloud_of_body_for_tag(tag_name='probability')

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One set of tag names per question, in df_y row order.
y = [set(tag_string.split(',')) for tag_string in df_y['Tag']]

# Turn the tag sets into a binary indicator matrix (one column per known tag).
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [None]:
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils

from tqdm import tqdm
import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):
    """Embed one text column of a DataFrame with a gensim Doc2Vec model.

    ``fit`` trains a Doc2Vec model on the cleaned, whitespace-tokenised text
    of column ``field``; ``transform`` infers one vector per row with that
    model.

    Args:
        vector_size: Dimensionality of the document vectors.
        learning_rate: Amount subtracted from the model's ``alpha`` after
            each extra training pass.
        epochs: Number of extra shuffled training passes run in ``fit``.
        field: Name of the DataFrame column holding the text.
    """

    def __init__(self,vector_size=100,learning_rate=0.02,epochs=1,field=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model  = None
        self.vector_size = vector_size
        # Leave one core free, but never request zero workers on a
        # single-core machine.
        self.workers = max(1, multiprocessing.cpu_count() - 1)
        self.field = field

    def fit(self,df_x,df_y=None):
        """Train the Doc2Vec model on ``df_x[self.field]``.

        Args:
            df_x: DataFrame containing the column named by ``self.field``.
            df_y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        # Each document is tagged with its DataFrame index label.
        tagged_x = [
            TaggedDocument(clean_text(row[str(self.field)]).split(), [index])
            for index, row in df_x.iterrows()
        ]
        model = Doc2Vec(documents=tagged_x, vector_size = self.vector_size, workers = self.workers)

        # NOTE(review): alpha is lowered by `learning_rate` after every pass;
        # with many epochs it can reach zero or go negative - confirm the
        # intended schedule.
        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples = len(tagged_x), epochs = 1)
            model.alpha -= self.learning_rate
            # Pin min_alpha to alpha so the rate stays fixed within a pass.
            model.min_alpha = model.alpha

        self._model = model
        return  self

    def transform(self,df_x):
        """Infer one Doc2Vec vector per row of ``df_x``.

        Args:
            df_x: DataFrame containing the column named by ``self.field``.

        Returns:
            A ``numpy.matrix`` with one row per input row.
        """
        # NOTE(review): newer scikit-learn releases may reject np.matrix
        # inputs downstream - confirm against the installed version.
        inferred_vectors = [
            self._model.infer_vector(clean_text(row[str(self.field)]).split())
            for index, row in df_x.iterrows()
        ]
        return np.asmatrix(np.array(inferred_vectors))

In [None]:
from sklearn.model_selection import train_test_split
# A fixed seed keeps the train/test split (and every metric computed from it)
# identical across Restart & Run All; the split size stays at sklearn's default.
train_x,test_x,train_y,test_y = train_test_split(
    df_x, encoded_y, random_state=42)

In [None]:
from sklearn.pipeline import FeatureUnion
# Concatenate the Doc2Vec embeddings of the 'Title' and 'Body' columns side by side.
fu = FeatureUnion(
    transformer_list=[
        ('title_doc2vec', Doc2VecTransformer(field='Title')),
        ('body_doc2vec', Doc2VecTransformer(field='Body')),
    ]
)

In [None]:
from sklearn.pipeline import  Pipeline
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import  RandomForestClassifier


In [None]:
# Binary relevance wrapper around a random forest base classifier.
binary_rel_model = BinaryRelevance(classifier=RandomForestClassifier(n_jobs=-1))

# Full model: Doc2Vec features followed by the multi-label classifier.
multi_label_rf_br_model = Pipeline(
    steps=[
        ('feature_union', fu),
        ('binary_relevance', binary_rel_model),
    ]
)

In [None]:
import sklearn.metrics as metrics
def hamming_loss(multi_label_model_pipeline,train_x,train_y,test_x,test_y):
    """Return the Hamming loss of a fitted pipeline on the test split.

    Args:
        multi_label_model_pipeline: Object exposing ``predict``; it is not
            fitted here, so it must already be trained.
        train_x: Unused; kept so existing callers keep working.
        train_y: Unused; kept so existing callers keep working.
        test_x: Features passed to ``predict``.
        test_y: True label indicators compared against the predictions.

    Returns:
        The value of ``sklearn.metrics.hamming_loss(test_y, predictions)``.
    """
    predictions_test_y = multi_label_model_pipeline.predict(test_x)
    # Compare against the `test_y` argument; the previous code referenced an
    # undefined name `y_test` and raised NameError.
    return metrics.hamming_loss(y_true=test_y, y_pred=predictions_test_y)
    