In [15]:
import os, json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD



In [16]:
# Feature Extraction Cell

# this finds our json files
path_to_json = '../data_folder/json_train/'

stop_list = list(STOPWORDS) + ["sil", "uh", "um"]

VOCAB = set()
score_dict = {}
tot = {}
voc_dict = {}
vocab_list = []

new_dict = {}
new_dict["f_1"] = []
new_dict["f_2"] = []
new_dict["f_3"] = []
new_dict["scores"] = []
new_dict["binary_label"] = []
new_dict["multiclass_label"] = []

data = []
print("Extraction features....")
setter = set()
inter = set()
# we need both the json and an index number so use enumerate()
for subdir, dirs, files in os.walk(path_to_json):
    for file in files:
        if file.endswith(".json"):
            path = os.path.join(subdir, file)
            with open(path, 'r') as f:
                json_text = json.load(f)
            id_ = file
            score = json_text["score"]
            
            if id_ in tot.keys():
                continue
                
            if score < 5:
                print("Outlier, Score less than 5")
                continue
            
            if not json_text["tokens"]:
                print("Outlier, no tokens")
                continue
                
            text = ""
            doc_vocab = set()
            counter = 0
            for tok in json_text["tokens"]:
                Text = tok["text"].lower()
                if Text not in stop_list:
                    text += " " + Text
                    counter += 1
                    doc_vocab.add(Text)
                    VOCAB.add(Text)
                    vocab_list.append(Text)

                
            data.append(text)
            new_dict["scores"].append(score)
            
            # new words pr min
            f_1 = len(doc_vocab) / json_text["elapsed_time"]
            if (f_1 < 0.15):
                print('ERRORRRRRR')
                print(subdir)
                
            feature_dict = {}            
            feature_dict["new_words_pr_min"] = f_1
            
            # repeated words pr min
            f_2 = len(doc_vocab) / (counter * json_text["elapsed_time"])
            feature_dict["repeated_words_pr_min"] = f_2

            new_dict["f_1"].append(f_1)
            new_dict["f_2"].append(f_2)

            feature_dict["time"] = json_text["elapsed_time"]
            tot[id_] = feature_dict

            voc_dict[id_] = doc_vocab

            union = setter.union(doc_vocab - inter)
            intersect = setter.intersection(doc_vocab)
            setter = union - intersect
            inter = intersect

            # Labelling process

            # Goal: set thresholds to get uniform distribution
            if score > 91:
                new_dict["binary_label"].append(1)
                if score > 95.5:
                    new_dict["multiclass_label"].append(3)
                else:
                    new_dict["multiclass_label"].append(2)
            else:
                new_dict["binary_label"].append(0)
                if (score < 91) & (score > 79):
                    new_dict["multiclass_label"].append(1)
                else:
                    new_dict["multiclass_label"].append(0)

# f_3
for id_ in tot.keys():
    voc = voc_dict[id_]
    time = tot[id_]["time"]
    f_3 = len(voc.intersection(setter)) / (time)
    new_dict["f_3"].append(f_3)




Extraction features....
Outlier, Score less than 5
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens
Outlier, no tokens


In [17]:
print("training on",len(tot.keys()),"data points")


training on 3230 data points


In [18]:
print("create dataframe from dictionary")
df = pd.DataFrame.from_dict(new_dict)


create dataframe from dictionary


In [19]:
#TF-IDF 
# Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
transformer = TfidfTransformer()
data = transformer.fit_transform(X).toarray()

print("Normalizing data")


def normalizer(df):
    result = df.copy()
    for feature_name in df.columns:
        if "f_" in feature_name:
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result


df = normalizer(df)

print("Preprocessing done")


Normalizing data
Preprocessing done


In [20]:

# Training

print("--------------------------------------")
print("F1-F3")
print("Preparing binary classification task")

y = df.iloc[:, -2]
X = df.iloc[:, :-3]
average = "binary"

def algo(a_train, a_test, b_train, b_test, pr_cl):

    ############Logistic Regression #####################


    LR = LogisticRegression(solver='lbfgs', multi_class='ovr').fit(a_train, b_train)
    predicted = LR.predict(a_test)


    ############Random Forest classifier#####################


    rf = RandomForestClassifier()
    rf.fit(a_train, b_train)
    predictions = rf.predict(a_test)


    # ######################## Naive Bayes classifier###############

    gnb = GaussianNB()
    gnb.fit(a_train, b_train)
    prediction = gnb.predict(a_test)


    # Model Accuracy, how often is the classifier correct?

    ########################## Gradient Classifier###################


    gb = GradientBoostingClassifier()
    gb.fit(a_train, b_train)

    if pr_cl == "binary":
        print("Accuracy, logistic regression: {0:.3f}".format(LR.score(a_test, b_test)))
        print("F1 score, logistic regression: {0:.3f}".format(f1_score(predicted, b_test)))

        print("Accuracy for RandomForestClassifier: {0:.3f}".format(metrics.accuracy_score(b_test, predictions)))
        print("F1 score, RandomForestClassifier: {0:.3f}".format(f1_score(predictions, b_test)))

        print("Accuracy for Naive Bayes Classifier: {0:.3f}".format(metrics.accuracy_score(b_test, prediction)))
        print("F1 score, Naive Bayes: {0:.3f}".format(f1_score(prediction, b_test)))

        print("Accuracy for Gradient Boosting: {0:.3f}".format(gb.score(a_test, b_test)))
        print("F1 score, Gradient Boosting: {0:.3f}".format(f1_score(predictions, b_test)))

    else:
        print("Accuracy, logistic regression: {0:.3f}".format(LR.score(a_test, b_test)))
        print("F1 score, logistic regression: {0:.3f}".format(f1_score(predicted, b_test, average='weighted')))

        print("Accuracy for RandomForestClassifier: {0:.3f}".format(metrics.accuracy_score(b_test, predictions)))
        print("F1 score, RandomForestClassifier: {0:.3f}".format(f1_score(predictions, b_test, average='weighted')))

        print("Accuracy for Naive Bayes Classifier: {0:.3f}".format(metrics.accuracy_score(b_test, prediction)))
        print("F1 score, Naive Bayes: {0:.3f}".format(f1_score(prediction, b_test, average='weighted')))

        print("Accuracy for Gradient Boosting: {0:.3f}".format(gb.score(a_test, b_test)))
        print("F1 score, Gradient Boosting: {0:.3f}".format(f1_score(predictions, b_test, average='weighted')))



a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)

print("--------------------------------------")
print("F1-F3")
print("Preparing MULTICLASS classification task")

y = df.iloc[:, -1]
average = "weighted"
a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)


--------------------------------------
F1-F3
Preparing binary classification task
Accuracy, logistic regression: 0.704
F1 score, logistic regression: 0.661
Accuracy for RandomForestClassifier: 0.619
F1 score, RandomForestClassifier: 0.579
Accuracy for Naive Bayes Classifier: 0.669
F1 score, Naive Bayes: 0.567
Accuracy for Gradient Boosting: 0.695
F1 score, Gradient Boosting: 0.579
--------------------------------------
F1-F3
Preparing MULTICLASS classification task
Accuracy, logistic regression: 0.379
F1 score, logistic regression: 0.483
Accuracy for RandomForestClassifier: 0.401
F1 score, RandomForestClassifier: 0.408
Accuracy for Naive Bayes Classifier: 0.398
F1 score, Naive Bayes: 0.472
Accuracy for Gradient Boosting: 0.416
F1 score, Gradient Boosting: 0.408


In [21]:


# ONLY TF-IDF AS PREDICTIVE FEATURE
print("HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH")
print("TF-IDF")
print("Preparing BINARY classification task")
y = df.iloc[:, -2]
average = "binary"
a_train, a_test, b_train, b_test = train_test_split(data, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)

##################################################################

print("--------------------------------------")
print("Preparing MULTICLASS classification task")

y = df.iloc[:, -1]
average = "weighted"
a_train, a_test, b_train, b_test = train_test_split(data, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)



HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
TF-IDF
Preparing BINARY classification task
Accuracy, logistic regression: 0.684
F1 score, logistic regression: 0.715
Accuracy for RandomForestClassifier: 0.641
F1 score, RandomForestClassifier: 0.590
Accuracy for Naive Bayes Classifier: 0.596
F1 score, Naive Bayes: 0.590
Accuracy for Gradient Boosting: 0.676
F1 score, Gradient Boosting: 0.590
--------------------------------------
Preparing MULTICLASS classification task
Accuracy, logistic regression: 0.443
F1 score, logistic regression: 0.442
Accuracy for RandomForestClassifier: 0.402
F1 score, RandomForestClassifier: 0.402
Accuracy for Naive Bayes Classifier: 0.356
F1 score, Naive Bayes: 0.356
Accuracy for Gradient Boosting: 0.443
F1 score, Gradient Boosting: 0.402


In [22]:

print("Run SVD on Data frame to deal with sparsity")


# Training -------------------------------------

raw_data = normalize(data, axis = 0)

svd = TruncatedSVD(n_components=5)
svd.fit(raw_data)
new_data = svd.transform(raw_data)

print("Preparing BINARY classification task, TF-IDF with SVD")
# With SVD
y = df.iloc[:, -2]
average = "binary"
a_train, a_test, b_train, b_test = train_test_split(new_data, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)


print("--------------------------------------")
print("Preparing MULTICLASS classification task, TF-IDF with SVD")

y = df.iloc[:, -1]
average = "weighted"
a_train, a_test, b_train, b_test = train_test_split(new_data, y, test_size=0.20, random_state=42)
algo(a_train, a_test, b_train, b_test, average)



Run SVD on Data frame to deal with sparsity
Preparing BINARY classification task, TF-IDF with SVD
Accuracy, logistic regression: 0.534
F1 score, logistic regression: 0.642
Accuracy for RandomForestClassifier: 0.599
F1 score, RandomForestClassifier: 0.551
Accuracy for Naive Bayes Classifier: 0.550
F1 score, Naive Bayes: 0.088
Accuracy for Gradient Boosting: 0.579
F1 score, Gradient Boosting: 0.551
--------------------------------------
Preparing MULTICLASS classification task, TF-IDF with SVD
Accuracy, logistic regression: 0.344
F1 score, logistic regression: 0.444
Accuracy for RandomForestClassifier: 0.365
F1 score, RandomForestClassifier: 0.367
Accuracy for Naive Bayes Classifier: 0.293
F1 score, Naive Bayes: 0.402
Accuracy for Gradient Boosting: 0.398
F1 score, Gradient Boosting: 0.367


  'recall', 'true', average, warn_for)


In [23]:


################### Regression########################
print("--------------------------------------")
print("Preparing regression task")

def regressor(a_train, a_test, b_train, b_test):
    LinR = LinearRegression().fit(a_train, b_train)
    predicted = LinR.predict(a_test)
    rmse = sqrt(mean_squared_error(predicted, b_test))
    print("Root Mean square error")
    print(rmse)


y = df.iloc[:, -3]


print("F1-F3")
a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.20, random_state=42)
regressor(a_train, a_test, b_train, b_test)



print("TF-IDF")
a_train, a_test, b_train, b_test = train_test_split(data, y, test_size=0.20, random_state=42)
regressor(a_train, a_test, b_train, b_test)


print("TF-IDF, SVD")
a_train, a_test, b_train, b_test = train_test_split(new_data, y, test_size=0.20, random_state=42)
regressor(a_train, a_test, b_train, b_test)




--------------------------------------
Preparing regression task
F1-F3
Root Mean square error
13.687577918345099
TF-IDF
Root Mean square error
11.911533775624772
TF-IDF, SVD
Root Mean square error
13.03805680874539
