In [1]:
    #General
import pandas as pd
import numpy as np


    #Plotting
import matplotlib.pyplot as plt
import seaborn as sns


    #Sklearn Packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.feature_extraction import stop_words, text
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

%config InlineBackend.figure_format = 'retina'

  import pandas.util.testing as tm


#### Reading in DataFrame

In [2]:
df = pd.read_csv('./Datasets/cleaned.csv')

In [3]:
df.isnull().sum()

questions_id               0
questions_author_id        0
questions_date_added       0
questions_title            0
questions_body             0
questions_score            0
tag_id                     0
tag_name                   0
answers_id              2340
answers_author_id       2340
answers_date_added      2340
answers_body              24
answers_score              0
qa_match                   0
dtype: int64

#### Removing 'answer' columns from dataframe.

Since we don't want any bleeding in our prediction data, we're removing all the 'answer' columns. This also get's rid of all our null values. We're keeping `qa_match` because that will be our y-label.

In [4]:
#Using a for loop to select all our columns with answer/answers in the name
answer_cols = []
for cols in df.columns:
    if 'answer' in cols:
        answer_cols.append(cols)
        
#Dropping answe_cols
df.drop(columns = answer_cols, inplace= True)

## Preprocessing

#### Transforming Data With FunctionTransformer

## <span style = 'color:red'> Why?

In [9]:
get_text_data = FunctionTransformer(lambda x: x['questions_body'], validate = False)
get_numeric_data = FunctionTransformer(lambda x: x[['tag_id']], validate = False)

## Modeling

#### Instantiating our X and y variables

In [10]:
y = df['qa_match']
X = df[['questions_body', 'tag_id']]

#### Train Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   random_state=42)

## Logicstic Regression

#### Building a Pipeline to Grid Search Using Standard Scaler, Count Vectorizer

In [12]:
pipe_logreg = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    ('logreg', LogisticRegression(solver='liblinear'))
])

params = {
           'logreg__penalty' : ['l1', 'l2']
}

gs = GridSearchCV(pipe_logreg, params, cv=5)

gs.fit(X_train, y_train)
print("train score", gs.score(X_train, y_train))
print("test score", gs.score(X_test, y_test))
print("best params:", gs.best_params_)

train score 0.9946112564864505
test score 0.9930811194393933
best params: {'logreg__penalty': 'l2'}


## KNN

In [None]:
pipe_knn = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    ('knn', KNeighborsClassifier())

])

params = {
    'knn__n_neighbors' : [3, 5, 10, 15, 20],
#     'knn__metric': ['euclidean', 'manhattan']  #Takes a while to run

}

gs = GridSearchCV(pipe_knn, params, cv=5)

gs.fit(X_train, y_train)
print("train score:", gs.score(X_train, y_train))
print("test score", gs.score(X_test, y_test))
print("best params:", gs.best_params_)

## Decision Tree

In [None]:
pipe_knn = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    ('dt', DecisionTreeClassifier(random_state = 42))

])

params = {
    'dt__max_depth': [3, 5],  
    'dt__min_samples_split': [3, 5, 7],
    'dt__min_samples_leaf': [3, 5], 
#     'dt__random_state': 42
#     'knn__n_neighbors' : [3, 5, 10, 15, 20]
#             'cvec__stop_words' : [None, 'english'],
#            'logreg__penalty' : ['l1', 'l2']
}

gs = GridSearchCV(pipe_knn, params, cv=5)

gs.fit(X_train, y_train)
print("train score:", gs.score(X_train, y_train))
print("test score:", gs.score(X_test, y_test))
print("best params:", gs.best_params_)

## Random Forest

In [None]:
pipe_knn = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    ('rf', RandomForestClassifier(random_state=42))

])

params = {
    'rf__n_estimators': [100,125],
    'rf__max_depth': [None, 4, 5, 6],
    'rf__max_features': [None,"auto"]}
    


gs = GridSearchCV(pipe_knn, params, cv=5)

gs.fit(X_train, y_train)
print("train score:", gs.score(X_train, y_train))
print("test score:", gs.score(X_test, y_test))
print("best params:", gs.best_params_)

#### Setting up a Function that Returns a Confusion Matrix as a DataFrame

In [13]:
def make_confusion(y_test, preds, classes):

    conmat = confusion_matrix(y_test, preds)
    print(f'Accuracy Score: {accuracy_score(y_test, preds)}')
    print(f'Precision Score: {precision_score(y_test, preds)}')
    print(f'Recall Score: {recall_score(y_test, preds)}')
    return pd.DataFrame(conmat, columns=['Predicted ' +class_ for class_ in classes], \
                index=['Actual '+ class_ for class_ in classes])

#### Calling `make_confusion` function to get Accuracy, Precision and Recalls Scores and Confusion Matrix

In [14]:
# build a function to print out a nice confusion matrix
preds = gs.best_estimator_.predict(X_test)

make_confusion(y_test, preds, ["wasn't answered", "was answered"])

Accuracy Score: 0.9930811194393933
Precision Score: 0.9933224646581951
Recall Score: 0.9997078060731384


Unnamed: 0,Predicted wasn't answered,Predicted was answered
Actual wasn't answered,304,299
Actual was answered,13,44478


## Coefficients - Words Most Indicitive if the Question Will Get Answered or Not

#### Setting up Logicstic Regression For Extracting Coefficients

In [16]:
pipe_logreg = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    ('logreg', LogisticRegression(solver='liblinear'))
])

params = {
           'logreg__penalty' : ['l1', 'l2']
}

gs = GridSearchCV(pipe_logreg, params, cv=5)

gs.fit(X_train, y_train)
print("train score", gs.score(X_train, y_train))
print("test score", gs.score(X_test, y_test))
print("best params:", gs.best_params_)

train score 0.996281840895315
test score 0.9948773672772431
best params: {'logreg__penalty': 'l2'}


#### Extracting Features and Coefficients

In [15]:
features = ['word_count', 'sentiment_score'] +\
gs.best_estimator_.named_steps['features'].transformer_list[1][1].named_steps['cvec'].get_feature_names()


coefficients = gs.best_estimator_.named_steps['logreg'].coef_[0]

#### Creating A Data Frame with Coefficients and Exponentiated Coefficients

In [16]:
coef_df = pd.DataFrame({'features': features, 
              'coef' : coefficients,
              'exp_coef': [np.exp(coef) for coef in coefficients] #exponentiated coefficients
             })

coef_df

ValueError: arrays must all be same length

#### Viewing the Words that are Most Indicitive for Question Being Answered

In [None]:
coef_df = coef_df.set_index('features')
coef_df = coef_df.sort_values('exp_coef', ascending = False)
coef_df.head(10)

#### Viewing the Words that are Most Indicitive for Questions Not Being Answered

In [None]:
coef_df.tail(10)

#### Graphing Words Most Indicitive to Having Questions Answered

In [None]:
weights = coef_df['exp_coef'].head(10).sort_values()
labels = weights.index

plt.figure(figsize=(20,10))
plt.barh(labels, weights, color = 'orange')
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('Exponential Coefficient', fontsize=30)
plt.title(f'Top 10 Features Predicting that Question Will Be Answered ', fontsize=42)
plt.tight_layout()

#### Graphing Words Most Indicitive from Not Getting Answered

In [None]:
weights = coef_df['exp_coef'].tail(10).sort_values()
labels = weights.index

plt.figure(figsize=(20,10))
plt.barh(labels, weights, color = 'skyblue')
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('Exponential Coefficient', fontsize=30)
plt.title(f"Top 10 Features Predicting it Won't be Answered", fontsize=42)
plt.tight_layout()