In [None]:
## for data
import scipy
import json
import pandas as pd
import numpy as np

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

## for explainer
from lime import lime_text

## for word embedding
import gensim
import gensim.downloader as gensim_api

## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

## for bert language model
import transformers

import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# Load the scraped CSV into a DataFrame.
# NOTE(review): this is a machine-specific absolute path — confirm it before re-running elsewhere.
csv_path = 'C:/Users/likki/Downloads/webscraping_stack4.csv'
dtf = pd.read_csv(csv_path)
dtf.head()  # preview the first five rows

In [None]:
"""Data Overview"""

# Shape, column names and per-column distinct-value counts of dtf.
n_rows, n_cols = dtf.shape
print('Rows x Columns : ', n_rows, 'x', n_cols)
print('Features: ', dtf.columns.tolist())
print('\nUnique values:', dtf.nunique(), sep='\n')

In [None]:
"""type of entries and missing values"""

# Column dtypes / non-null counts, then the null tally (total and per column).
dtf.info()
missing_per_column = dtf.isnull().sum()
print('\nMissing values:  ', missing_per_column.values.sum())
missing_per_column


In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
dtf.describe().T

In [None]:
# eliminate null values
# Every NaN in the frame is swapped for the string 'Not Available'.
# NOTE(review): a numeric column that contained NaN will now hold strings as well —
# confirm that is intended before doing arithmetic on it.
dtf=dtf.replace(np.nan, 'Not Available')
dtf

In [None]:
# Tidy the scraped text columns.
# `regex` is passed explicitly: the default of Series.str.replace is version-dependent
# (True in older pandas, False from 2.0), so the previous patterns "[\n]" and "[answer]"
# were either treated as literals (no-op) or, for "[answer]", as a character class that
# blanks every a/n/s/w/e/r letter. The intent is: newlines -> space, drop the word
# "answer"/"answers".
dtf['Content'] = dtf['Content'].str.replace("\n", " ", regex=False)
dtf['Number_of_Answers'] = (
    dtf['Number_of_Answers']
    .str.replace("\n", " ", regex=False)
    .str.replace(r"answers?", " ", regex=True)
)

# Positional drops: the first column, then the column at position 6 of what remains.
# NOTE(review): positions depend on the CSV layout — dropping by name would be safer.
dtf = dtf.drop(columns=dtf.columns[[0]])
dtf = dtf.drop(columns=dtf.columns[[6]])

# Alphabetical column order, then rows sorted by Category and Votes (both descending).
dtf = dtf.reindex(sorted(dtf.columns), axis=1)
dtf = dtf.sort_values(by=['Category', 'Votes'], ascending=False)


In [None]:
# Inspect dtf after the column clean-up, re-ordering and sort.
dtf

In [None]:
# Horizontal bar chart of the number of rows per Category, smallest count first.
fig, ax = plt.subplots()
fig.suptitle("Category", fontsize=12)
# reset_index() exposes the row labels as an "index" column so count() has something to tally.
category_counts = (
    dtf["Category"]
    .reset_index()
    .groupby("Category")
    .count()
    .sort_values(by="index")
)
bars = category_counts.plot(kind="barh", legend=False, ax=ax)
bars.grid(axis='x')
plt.show()

In [None]:
# Display dtf again (the plotting cell before this one does not modify it).
dtf 

In [None]:
# Strip the list punctuation (quotes and square brackets) from the Tag strings.
# A single explicit character-class regex replaces three calls that relied on the
# version-dependent `regex` default ("[" and "]" alone are not valid regexes).
dtf['Tag'] = dtf['Tag'].str.replace(r"[\[\]']", "", regex=True)
dtf

In [None]:
# Display dtf once more (same frame the previous cell already showed).
dtf

In [None]:
# Turn each comma-separated Tag string into a list of tag strings.
# Done as one column-level assignment instead of `dtf['Tag'][i] = ...`, which is
# chained assignment (unreliable under pandas copy-on-write) and only worked while
# the row labels happened to be 0..n-1.
# Values that are not strings (e.g. already-split lists on a re-run) are left untouched,
# so the cell can be executed more than once.
dtf['Tag'] = dtf['Tag'].map(
    lambda tag: tag.split(",") if isinstance(tag, str) else tag
)

In [None]:
# Inspect dtf after the Tag column has been split on commas.
dtf

In [None]:
# Show the Python type of the object returned by dtf['Tag'].
print(type(dtf['Tag']))

In [None]:
# Collect every row's tag list, looked up by row label 0, 1, ... len-1
# (label order, which is not the frame's current sorted order).
tag_column = dtf['Tag']
row = [tag_column[i] for i in range(len(tag_column))]
    
    

In [None]:
# Flatten the per-row tag lists into one list, trimming surrounding whitespace.
tags = [tag.strip() for tag_list in row for tag in tag_list]

In [None]:
# De-duplicate the flat tag list; going through a set means the resulting order is arbitrary.
all_tags=list(set(tags))

In [None]:
# Number of entries in all_tags (distinct tags, when run right after the de-duplication cell).
len(all_tags)

In [None]:
import nltk
# Rebinds all_tags: from here on it is an nltk.FreqDist built from the flat `tags` list,
# no longer the de-duplicated list created earlier.
all_tags = nltk.FreqDist(tags) 

In [None]:
# Display the FreqDist held in all_tags.
all_tags

In [None]:
# Two-column frame (Tag, Count) built from the keys and values of all_tags.
tag_names = list(all_tags.keys())
tag_counts = list(all_tags.values())
tags_df = pd.DataFrame({'Tag': tag_names, 'Count': tag_counts})

In [None]:
# Display the Tag / Count frame.
tags_df

In [None]:
# Bar chart of the 50 tags with the highest Count.
g = tags_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Tag")
# The y axis lists tags; it was previously mislabelled 'Count'.
ax.set(xlabel='Count', ylabel='Tag')
plt.show()

In [None]:
#function for text cleaning 
def clean_text(text):
    """Reduce ``text`` to lowercase a-z words separated by single spaces.

    Apostrophes are deleted outright (so "don't" becomes "dont"); every other
    character outside a-z / A-Z is treated as a word break. Leading, trailing
    and repeated whitespace is collapsed.
    """
    # drop apostrophes first, then turn every remaining non-letter into a space
    letters_only = re.sub("[^a-zA-Z]", " ", re.sub("\'", "", text))
    # lowercase, split on any whitespace run and re-join with single spaces
    words = letters_only.lower().split()
    return ' '.join(words)



In [None]:
# Keep only the samples whose Tag value has non-zero length.
# `.copy()` makes tags_new an independent frame, so the columns added to it later
# do not trigger SettingWithCopyWarning or depend on view/copy semantics of dtf.
# NOTE(review): a tag string that was empty before splitting becomes [''] (length 1)
# and therefore survives this filter — confirm whether such rows should be dropped.
tags_new = dtf[dtf['Tag'].str.len() != 0].copy()

In [None]:
# Display the filtered frame.
tags_new

In [None]:

# Add a Cleaned_Topics column holding clean_text() applied to every Topics value.
# `assign` returns a new frame, which avoids writing a column into what may be a
# slice of dtf (SettingWithCopyWarning) and keeps the cell safe to re-run.
tags_new = tags_new.assign(Cleaned_Topics=tags_new['Topics'].apply(clean_text))
tags_new

In [None]:
def freq_words(x, terms = 30): 
    """Plot the ``terms`` most frequent whitespace-separated words in ``x``.

    x     -- iterable of strings (e.g. a text column); all strings are pooled.
    terms -- how many of the top words to draw (default 30).

    Draws a horizontal seaborn bar chart and shows it; returns nothing.
    """
    # pool every document into one token list and count the tokens
    tokens = ' '.join(x).split()
    fdist = nltk.FreqDist(tokens)
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})

    # keep the `terms` highest counts
    top_words = words_df.nlargest(columns="count", n=terms)

    # visualize words and frequencies
    plt.figure(figsize=(12, 15))
    ax = sns.barplot(data=top_words, x="count", y="word")
    ax.set(ylabel='Word')
    plt.show()

# plot the 100 most frequent words of the cleaned topics
freq_words(tags_new['Cleaned_Topics'], 100)

In [None]:
# Fetch the NLTK 'stopwords' resource, which stopwords.words('english') reads later on.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated word found in ``stop_words`` removed.

    The surviving words are re-joined with single spaces; matching is exact
    (case-sensitive) against the module-level ``stop_words`` set.
    """
    kept_words = (word for word in text.split() if word not in stop_words)
    return ' '.join(kept_words)

# Strip stopwords from the already-cleaned topics.
# `assign` builds a new frame instead of writing into what may be a slice of dtf,
# which avoids SettingWithCopyWarning.
tags_new = tags_new.assign(
    Cleaned_Topics=tags_new['Cleaned_Topics'].apply(remove_stopwords)
)

In [None]:
# Display tags_new after stopword removal.
tags_new

In [None]:
# Re-plot the top 100 words, now computed on the stopword-free Cleaned_Topics column.
freq_words(tags_new['Cleaned_Topics'], 100)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the tag vocabulary from the Tag lists (fit returns the fitted binarizer itself).
multilabel_binarizer = MultiLabelBinarizer().fit(tags_new['Tag'])

# Encode the target: one row per sample, one column per known tag.
y = multilabel_binarizer.transform(tags_new['Tag'])

In [None]:
# Display the binarized target matrix.
y

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unfitted vectorizer; it is fitted later on the training split only.
# NOTE(review): per the scikit-learn docs, max_df=0.8 should drop terms appearing in
# more than 80% of documents and max_features=10000 should cap the vocabulary size —
# confirm against the installed version.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [None]:
# Display the vectorizer's configuration.
tfidf_vectorizer

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; random_state pins the shuffle.
xtrain, xval, ytrain, yval = train_test_split(
    tags_new['Cleaned_Topics'],
    y,
    test_size=0.2,
    random_state=9,
)

In [None]:
# create TF-IDF features
# The vectorizer is fitted on the training texts only; the validation texts are
# transformed with that already-fitted vectorizer rather than being fitted on.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [None]:
# Logistic regression with default settings, wrapped in a one-vs-rest classifier
# (the "Binary Relevance" setup named in the import cell).
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [None]:
# fit model on train data
# Inputs are the TF-IDF training features and the binarized training targets.
clf.fit(xtrain_tfidf, ytrain)

In [None]:
# make predictions for validation set
# Uses the classifier's own predict() decision rule (no custom threshold).
y_pred = clf.predict(xval_tfidf)

In [None]:
# Raw prediction row at position 3 of the validation predictions.
y_pred[3]

In [None]:
# Same prediction (position 3), mapped back to tag labels by the fitted binarizer.
multilabel_binarizer.inverse_transform(y_pred)[3]

In [None]:
# evaluate performance
# Micro-averaged F1 of the predict() output against the validation targets.
f1_score(yval, y_pred, average="micro")

In [None]:
# predict probabilities
# Probabilities for the validation features; thresholded manually in the next step.
y_pred_prob = clf.predict_proba(xval_tfidf)

In [None]:
t = 0.3 # threshold value
# A label is predicted (1) wherever its probability is at least t, otherwise 0.
y_pred_new = (y_pred_prob >= t).astype(int)

In [None]:
# evaluate performance
# Micro-averaged F1 again, this time for the thresholded predictions (y_pred_new).
f1_score(yval, y_pred_new, average="micro")

In [None]:
def infer_tags(q):
    """Predict tags for a single piece of text ``q``.

    The text goes through the same preparation as the training data
    (clean_text, then remove_stopwords), is vectorized with the fitted
    tfidf_vectorizer, classified with clf.predict, and the prediction is
    mapped back to tag labels via multilabel_binarizer.inverse_transform,
    whose result is returned as-is.
    """
    prepared = remove_stopwords(clean_text(q))
    # transform() expects a collection of documents, hence the one-element list
    features = tfidf_vectorizer.transform([prepared])
    return multilabel_binarizer.inverse_transform(clf.predict(features))

In [None]:
# Display tags_new for reference before the spot-check.
tags_new

In [None]:
# Spot-check the model on up to 10 distinct validation samples.
# - One seeded sample() call makes the printed rows reproducible and avoids
#   drawing the same row twice (the old loop sampled one unseeded row per iteration).
# - The first label now says "Topic" because the value printed is the Topics text,
#   not a tag.
# - Lookups use .loc since k is a row label, not a position.
for k in xval.sample(n=min(10, len(xval)), random_state=9).index:
    print("Topic: ", tags_new.loc[k, 'Topics'], "\nPredicted tags: ", infer_tags(xval.loc[k]))
    print("Actual tag: ", tags_new.loc[k, 'Tag'], "\n")
