In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
from collections import Counter

from plotly import graph_objs as go
from sklearn import preprocessing 
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout, Bidirectional, Conv2D
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import transformers
from tokenizers import BertWordPieceTokenizer
from keras.layers import LSTM,Dense,Bidirectional,Input
from keras.models import Model
import torch
import transformers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the case-study CSV and preview its first rows.
data = pd.read_csv("../input/mldm-case-study-report/data.csv")
data.head()

In [None]:
# Start from an empty frame; its 'text' and 'class' columns are filled in by the next cell.
df = pd.DataFrame()

In [None]:
# Each 'content_words' cell is a string that is parsed into a sequence of words and
# then joined with spaces. ast.literal_eval accepts Python literals only, so unlike
# eval() a crafted CSV cell cannot execute arbitrary code.
# NOTE(review): assumes every cell is a plain list/tuple literal of strings - confirm.
from ast import literal_eval

df['text'] = data['content_words'].apply(lambda x: ' '.join(literal_eval(x)))
df['class'] = data['Level']

In [None]:
# Number of rows per class before any balancing.
df['class'].value_counts()

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the cleaning code (skipped when already present):
# the stopword list, and the Punkt models that nltk.word_tokenize depends on.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' -
# confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

stemming = PorterStemmer()
stops = set(stopwords.words("english"))

def apply_cleaning_function_to_list(X):
    """Return a new list holding `clean_text(element)` for every element of X, in order."""
    return [clean_text(item) for item in X]


def clean_text(raw_text):
    """Normalise one document for bag-of-words modelling.

    The text is lower-cased and tokenised with NLTK; only purely alphabetic
    tokens are kept, English stopwords are dropped, and the remaining words
    are Porter-stemmed.

    Parameters
    ----------
    raw_text : str
        The document to clean.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    tokens = nltk.word_tokenize(raw_text.lower())

    # Stopwords have to be removed before stemming: the stopword list holds
    # unstemmed forms, so stems such as "thi" / "wa" (from "this" / "was")
    # would never match it and would leak into the features.
    content_words = [w for w in tokens if w.isalpha() and w not in stops]

    return " ".join(stemming.stem(w) for w in content_words)

In [None]:
def create_bag_of_words(X):
    """Build bigram count features and TF-IDF features for a set of documents.

    Parameters
    ----------
    X : iterable of str
        Documents to vectorise (the vectoriser and TF-IDF weights are fitted on them).

    Returns
    -------
    vectorizer : CountVectorizer
        Fitted bigram vectoriser (at most 10000 features).
    vocab : list of str
        Feature names, in column order.
    train_data_features : numpy.ndarray
        Dense bigram count matrix for X.
    tfidf_features : numpy.ndarray
        Dense TF-IDF matrix for X.
    tfidf : TfidfTransformer
        Transformer fitted on the count matrix of X.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    print ('Creating bag of words...')

    # ngram_range is documented as a tuple; releases with parameter validation
    # reject a list.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(2, 2),
                                 max_features=10000)

    train_data_features = vectorizer.fit_transform(X).toarray()

    tfidf = TfidfTransformer()
    tfidf_features = tfidf.fit_transform(train_data_features).toarray()

    # get_feature_names() was removed from scikit-learn in favour of
    # get_feature_names_out(); support both so the cell runs on old and new releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out().tolist()
    else:
        vocab = vectorizer.get_feature_names()

    return vectorizer, vocab, train_data_features, tfidf_features, tfidf

In [None]:
def sampling_k_elements(group, k=3, random_state=None):
    """Down-sample `group` to at most `k` rows.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        The rows to sample from.
    k : int, default 3
        Maximum number of rows to keep.
    random_state : optional
        Forwarded to `group.sample`; pass a seed for a repeatable draw.
        The default (None) leaves the sampling unseeded.

    Returns
    -------
    `group` itself when it has fewer than `k` rows, otherwise a random
    sample of exactly `k` of its rows.
    """
    if len(group) < k:
        return group
    return group.sample(k, random_state=random_state)

# Cap every class at 500 rows. DataFrame.sample draws from NumPy's global RNG
# when it is given no random_state, so seed that RNG first to make the balanced
# subset (and every score computed from it) reproducible across runs.
np.random.seed(42)
df_balanced = df.groupby('class').apply(sampling_k_elements,500).reset_index(drop=True)

In [None]:
# Number of rows per class after the per-class cap.
df_balanced['class'].value_counts()

In [None]:
# Run every sampled document through clean_text and write the result back into the frame.
text_to_clean = df_balanced['text'].tolist()
cleaned_text = [clean_text(document) for document in text_to_clean]
df_balanced['text'] = cleaned_text
df_balanced.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the balanced sample for evaluation. The split is seeded (with the
# same seed the Stack Overflow split further down uses) so the scores are repeatable.
X = df_balanced['text']
y = df_balanced['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Fit the bigram vectoriser and the TF-IDF transformer on the training documents only.
vectorizer, vocab, train_data_features, tfidf_features, tfidf = create_bag_of_words(X_train)

In [None]:
# Vocabulary next to the counts and TF-IDF weights of the first training document
# (row 0 of each feature matrix), ordered by count.
bag_dictionary = pd.DataFrame({
    'ngram': vocab,
    'count': train_data_features[0],
    'tfidf_features': tfidf_features[0],
})
bag_dictionary = bag_dictionary.sort_values(by=['count'], ascending=False)
# Show top 10
print(bag_dictionary.head(10))

**1. LOGISTIC REGRESSION (TFIDF MODEL)**

In [None]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression(features, label):
    """Fit a LogisticRegression (C=100, random_state=0) on `features`/`label` and return it.

    Progress messages are printed before and after fitting.
    """
    from sklearn.linear_model import LogisticRegression

    print ("Training the logistic regression model...")
    classifier = LogisticRegression(C = 100,random_state = 0).fit(features, label)
    print ('Finished')
    return classifier

In [None]:
# Train the logistic-regression model on the TF-IDF features of the training split.
ml_model = train_logistic_regression(tfidf_features, y_train)

In [None]:
# Count bigrams in the held-out documents using the vocabulary fitted on the training split.
test_data_features = vectorizer.transform(X_test).toarray()

In [None]:
# Apply the IDF weights learned from the training split. Calling fit_transform here
# would re-estimate IDF from the test documents (and overwrite the fitted `tfidf`),
# so the test features would no longer be on the scale the classifiers were trained
# on and test data would leak into feature extraction.
test_data_tfidf_features = tfidf.transform(test_data_features).toarray()

In [None]:
# Score the logistic-regression model on the held-out TF-IDF features.
pred_prob = ml_model.predict_proba(test_data_tfidf_features)
predicted_y = ml_model.predict(test_data_tfidf_features)
correctly_identified_y = (predicted_y == y_test)
accuracy = 100 * np.mean(correctly_identified_y)
print ('Accuracy = ',accuracy,"%")

In [None]:
from sklearn.metrics import roc_curve

# One-vs-rest ROC curves for the logistic-regression model.
fpr = {}
tpr = {}
thresh = {}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to ml_model.classes_[i]; use that label as the
    # positive class instead of assuming the labels are the integers 0..2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:, i], pos_label=ml_model.classes_[i])

curve_styles = [('orange', 'HQ vs Rest'), ('green', 'LQ(Close) vs Rest'), ('blue', 'LQ(Open) vs Rest')]
for i, (color, label) in enumerate(curve_styles):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=color, label=label)
plt.plot([0,1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.title('Multiclass ROC curve - Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name, so this figure is not overwritten by later ROC cells
# that save to 'Multiclass ROC'.
plt.savefig('Multiclass ROC - Logistic Regression',dpi=300);

**2. NAIVE BAYES (TFIDF MODEL)**

In [None]:
# Multinomial Naive Bayes trained on the same TF-IDF training features.
nb_classifier = MultinomialNB().fit(tfidf_features, y_train)
nb_classifier

In [None]:
# Score the Naive Bayes model on the held-out TF-IDF features.
pred_prob = nb_classifier.predict_proba(test_data_tfidf_features)
predicted_y = nb_classifier.predict(test_data_tfidf_features)
correctly_identified_y = (predicted_y == y_test)
accuracy = 100 * np.mean(correctly_identified_y)
print ('Accuracy = ',accuracy,"%")

In [None]:
from sklearn.metrics import roc_curve

# One-vs-rest ROC curves for the Naive Bayes model.
fpr = {}
tpr = {}
thresh = {}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to nb_classifier.classes_[i]; use that label as
    # the positive class instead of assuming the labels are the integers 0..2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:, i], pos_label=nb_classifier.classes_[i])

curve_styles = [('orange', 'HQ vs Rest'), ('green', 'LQ(Close) vs Rest'), ('blue', 'LQ(Open) vs Rest')]
for i, (color, label) in enumerate(curve_styles):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=color, label=label)
plt.plot([0,1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.title('Multiclass ROC curve - Naive Bayes')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name, so this figure neither overwrites nor is overwritten by
# the other ROC cells that save to 'Multiclass ROC'.
plt.savefig('Multiclass ROC - Naive Bayes',dpi=300);

**3. DECISION TREE (TFIDF MODEL)**

In [None]:
# Decision tree on the TF-IDF training features. The tree is seeded (same value the
# logistic-regression model uses) so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(tfidf_features,y_train)

In [None]:
# Score the decision tree on the held-out TF-IDF features.
pred_prob = dt_classifier.predict_proba(test_data_tfidf_features)
predicted_y = dt_classifier.predict(test_data_tfidf_features)
correctly_identified_y = (predicted_y == y_test)
accuracy = 100 * np.mean(correctly_identified_y)
print ('Accuracy = ',accuracy,"%")

In [None]:
from sklearn.metrics import roc_curve

# One-vs-rest ROC curves for the decision tree.
fpr = {}
tpr = {}
thresh = {}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to dt_classifier.classes_[i]; use that label as
    # the positive class instead of assuming the labels are the integers 0..2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:, i], pos_label=dt_classifier.classes_[i])

curve_styles = [('orange', 'HQ vs Rest'), ('green', 'LQ(Close) vs Rest'), ('blue', 'LQ(Open) vs Rest')]
for i, (color, label) in enumerate(curve_styles):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=color, label=label)
plt.plot([0,1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.title('Multiclass ROC curve - Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name, so this figure neither overwrites nor is overwritten by
# the other ROC cells that save to 'Multiclass ROC'.
plt.savefig('Multiclass ROC - Decision Tree',dpi=300);

In [None]:
# Load the Stack Overflow questions file, merge title and body into one 'text'
# column, drop the columns that are no longer needed and rename the label column.
df = pd.read_csv("../input/60k-stack-overflow-questions-with-quality-rate/train.csv")
df['text'] = df['Title'] + " " + df['Body']
cols_to_drop = ['Id', 'Tags', 'CreationDate', 'Title', 'Body']
df = df.drop(columns=cols_to_drop).rename(columns={"Y": "class"})

# English stopwords plus punctuation characters.
# NOTE(review): `stop` is not read by any cell visible here, and `punctuation`
# rebinds the name imported from `string` in the import cell.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

def clean_text(text):
    """Lower-case `text` and remove every character that is not an ASCII letter or whitespace.

    NOTE: once this cell runs, this definition replaces the NLTK-based
    `clean_text` defined earlier in the notebook.
    """
    text = text.lower()
    # Parentheses inside a character class are literal characters, not grouping;
    # listing them in the negated class would let '(' and ')' survive the substitution.
    return re.sub(r'[^a-zA-Z\s]', '', text)
# Clean every question, integer-encode the labels, then make a seeded, stratified 80/20 split.
df['text'] = df['text'].map(clean_text)

label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'class'.
df['class'] = label_encoder.fit_transform(df['class'])
df['class'].unique()

x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['class'],
    test_size=0.2,
    random_state=42,
    stratify=df['class'],
)
# Vocabulary cap and padded sequence length used by the tokenizer and model cells.
max_features = 10000
maxlen = 300

In [None]:
# Fit the Keras tokenizer on the training texts (vocabulary capped via num_words=max_features),
# turn each text into an integer sequence and pad/truncate it to maxlen.
# NOTE: x_train is rebound from the raw texts to the padded array, so re-running this
# cell on its own would feed the padded array back into the tokenizer.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
tokenized_train = tokenizer.texts_to_sequences(x_train)
x_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training texts.
# NOTE: the padded result is bound to X_test (capital X), replacing the X_test of the
# TF-IDF section; lower-case x_test still holds the raw texts.
tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [None]:
# Path to the pre-trained GloVe Twitter vectors (the 200-dimensional file, going by its name).
EMBEDDING_FILE = '../input/glove-twitter/glove.twitter.27B.200d.txt'

In [None]:
def get_coefs(word, *arr):
    """Pair `word` with its remaining arguments converted to a float32 NumPy vector."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {word: vector}, one space-separated entry per line.
# The file is opened in a `with` block so the handle is closed once parsing finishes,
# and decoded explicitly as UTF-8 instead of the platform default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [None]:
# Mean and standard deviation over all pre-trained vectors; they parameterise the
# random initialisation of rows for words that have no pre-trained vector.
# np.stack requires a real sequence - current NumPy rejects a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# The matrix is passed as the weights of Embedding(max_features, ...), so it needs
# exactly max_features rows. Sizing it by len(word_index) would under-allocate for a
# small vocabulary (shape mismatch, and an out-of-range row index in the loop below).
nb_words = max_features
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Training hyper-parameters for the LSTM model.
batch_size = 256
epochs = 2
# NOTE: rebinds the embed_size derived from the loaded vectors in the previous cell;
# 200 agrees with the 200d file named in EMBEDDING_FILE.
embed_size = 200

In [None]:
# Callback that scales the learning rate by 0.4 (never below 1e-7) once val_accuracy
# has stopped improving for 2 epochs.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience = 2, verbose=1,factor=0.4, min_lr=0.0000001)

In [None]:
# Display the padded sequence length the model will be built with.
maxlen

In [None]:
# Two stacked bidirectional LSTMs on top of an embedding layer initialised from
# embedding_matrix (and fine-tuned, trainable=True), ending in a 3-way softmax.
# Labels are integer-encoded, hence the sparse categorical loss.
model = Sequential()
model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=True))
model.add(Bidirectional(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.4 , dropout = 0.4)))
model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.2 , dropout = 0.2)))
model.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras releases no longer accept.
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train the model. NOTE: validation_data is the held-out test split, so the val_* metrics
# (and the learning-rate callback that monitors val_accuracy) are computed on the same
# data that is later reported as the test result.
history = model.fit(x_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test) , epochs = epochs , callbacks = [learning_rate_reduction])

In [None]:
# model.evaluate returns [loss, accuracy]; report the accuracy entry as a percentage.
train_metrics = model.evaluate(x_train,y_train)
print("Accuracy of the model on Training Data is - " , train_metrics[1]*100 , "%")
test_metrics = model.evaluate(X_test,y_test)
print("Accuracy of the model on Testing Data is - " , test_metrics[1]*100 , "%")

In [None]:
# Learning curves: training vs. held-out accuracy and loss for each epoch.
train_acc = history.history['accuracy']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']

# Take the x-axis from the recorded history rather than a hard-coded range(2), and keep
# it under its own name: rebinding `epochs` (the integer passed to model.fit) to a list
# would break any re-run of the training cell.
epoch_axis = list(range(len(train_acc)))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,10)

ax[0].plot(epoch_axis , train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epoch_axis , val_acc , 'ro-' , label = 'Testing Accuracy')
ax[0].set_title('Training & Testing Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")

ax[1].plot(epoch_axis , train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epoch_axis , val_loss , 'ro-' , label = 'Testing Loss')
ax[1].set_title('Training & Testing Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
plt.show()

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6 onwards). For a
# softmax output the predicted class is the arg-max over the per-class probabilities.
pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:
# Per-class precision / recall / F1 on the held-out split; target_names are listed in
# encoded-label order (0, 1, 2).
print(classification_report(y_test, pred, target_names = ['HQ', 'LQ(Close)', 'LQ(Open)']))

In [None]:
# Confusion matrix of the LSTM predictions, drawn as an annotated heat map
# (rows are actual classes, columns are predicted classes).
class_names = ['HQ', 'LQ(Close)', 'LQ(Open)']
cm = pd.DataFrame(confusion_matrix(y_test,pred), index = class_names, columns = class_names)

plt.figure(figsize = (10,10))
sns.heatmap(
    cm,
    cmap= "Blues",
    linecolor = 'black',
    linewidth = 1,
    annot = True,
    fmt='',
    xticklabels = class_names,
    yticklabels = class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")

In [None]:
# Sequential.predict_proba was removed from Keras (TensorFlow 2.6 onwards). The model ends
# in a softmax layer, so model.predict already returns the per-class probabilities.
pred_prob = model.predict(X_test)

In [None]:
# Display the predicted class probabilities for the held-out split.
pred_prob

In [None]:
from sklearn.metrics import roc_curve

# One-vs-rest ROC curves for the neural model: class i is treated as the positive
# label and column i of pred_prob is used as its score.
n_class = 3
fpr, tpr, thresh = {}, {}, {}

for i in range(n_class):
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=i)

curve_styles = [('orange', 'HQ vs Rest'), ('green', 'LQ(Close) vs Rest'), ('blue', 'LQ(Open) vs Rest')]
for i, (colour, legend_label) in enumerate(curve_styles):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=colour, label=legend_label)
# Diagonal reference line.
plt.plot([0,1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);

In [None]:
def predict(text):
    """Return the model's output vector (one score per class) for a single raw text.

    The text is cleaned with `clean_text`, encoded with the fitted tokenizer and
    padded to `maxlen` before being passed to the model.
    """
    cleaned = clean_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    padded = sequence.pad_sequences(encoded, maxlen=maxlen)
    return model.predict([padded])[0]

In [None]:
# Display names indexed by encoded class id, then the predicted label for a hand-written question.
cols= ['HQ','LQ(CLOSE)','LQ(OPEN)']
cols[predict("I am developing a media player using vlc-qt , Actually I want a button which will play do the fast backward operation. I don't have the problem with the fast forward operation but not able to implement the fast backward operation, Is there any function there in vlc-qt which will play the video backwards. Here are the buttons code which I am using for fast forward and fast backward operation").argmax()]

In [None]:
# Predicted label for the question stored under index 7 of df (text already cleaned above).
cols[predict(df['text'][7]).argmax()]

In [None]:
# Predicted label for the question stored under index 0 of df.
cols[predict(df['text'][0]).argmax()]

In [None]:
# Predicted label for the question stored under index 2 of df.
cols[predict(df['text'][2]).argmax()]

In [None]:
# Show the cleaned text of the question under index 7 that was classified above.
df['text'][7]

In [None]:
# Show the cleaned text of the question under index 0 that was classified above.
df['text'][0]

In [None]:
# Show the cleaned text of the question under index 2 that was classified above.
df['text'][2]