# Modeling

In [1]:
# Import Dependencies

# General
import itertools
import os
import pickle

import pandas as pd

# Modeling functions
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

# Display
from pprint import pprint

In [2]:
# Directory that holds the pickled inputs and receives the back-up model.
# Set the WIKI_DATA_PATH environment variable to point somewhere else; when it is
# unset the original author's local directory is used, so existing runs are unchanged.
# os.path.join(..., "") guarantees a trailing path separator, which matters because
# the cells below build file names by plain concatenation (data_path + "name.pkl").
data_path = os.path.join(os.environ.get("WIKI_DATA_PATH", "/Users/seanosier/data/Metis/Wiki/"), "")

In [3]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    Serialize an object to disk with pickle.

    In:
        data = the object you want to pickle (save)
        filename = path of the file the pickle is written to (opened in binary
                   write mode, so an existing file is overwritten)
        python_version = forwarded unchanged as the pickle protocol number; the
                         name reflects the Python version expected to read the file

    Out:
        Nothing is returned; the pickled data is written to the filename you specify
    """
    with open(filename, mode="wb") as out_file:
        # A Pickler bound to the open file streams the bytes straight to disk.
        pickle.Pickler(out_file, protocol=python_version).dump(data)

def load_pickle(filename):
    """
    Read a pickle file back into memory.

    In:
        filename = path of the pickle file you want to open (e.g "my_pickle.pkl")

    Out:
        Returns the unpickled content so it can be assigned to a variable of your choice

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, mode="rb") as in_file:
        contents = pickle.Unpickler(in_file).load()
    return contents

In [4]:
# Load the two halves of the article data (stored as separate pickles) and
# concatenate them back into a single list
article_data_a, article_data_b = (
    load_pickle(data_path + part_file)
    for part_file in ("article_data_a.pkl", "article_data_b.pkl")
)
article_data = article_data_a + article_data_b

# Load the english sentences together with their labels
english_sentences_with_labels = load_pickle(
    data_path + "english_sentences_with_labels.pkl"
)

In [5]:
# Note: the english_sentences_with_labels list still needs to be flattened to the
# sentence level before its length can be compared with article_data
tuple(len(collection) for collection in (article_data, english_sentences_with_labels))

(13158106, 113298)

In [6]:
# Flatten english_sentences_with_labels by one level (articles -> their members).
# NOTE: this rebinds the cell's own input, so the cell is not idempotent; running
# it a second time would flatten one level further.
english_sentences_with_labels = list(
    itertools.chain.from_iterable(english_sentences_with_labels)
)

# After the first flatten, even positions are read as per-article sentence lists
# and odd positions as the matching per-article label lists; flatten each of
# those down to the sentence level.
english_sentences = list(
    itertools.chain.from_iterable(english_sentences_with_labels[0::2])
)
sentence_labels = list(
    itertools.chain.from_iterable(english_sentences_with_labels[1::2])
)

In [7]:
# The flattened sentences and labels are later paired positionally with
# article_data, so all three must have the same length. Assert it instead of
# relying on someone reading the printed counts.
assert len(english_sentences) == len(sentence_labels) == len(article_data), (
    "length mismatch: %d sentences, %d labels, %d article_data rows"
    % (len(english_sentences), len(sentence_labels), len(article_data))
)
len(english_sentences), len(sentence_labels)

(13158106, 13158106)

In [8]:
# Confirm sentences in each list match up correctly by spot-checking the last record.
# Index -1 replaces the hard-coded 13158105 so the check keeps working if the
# number of sentences changes.
# Each article_data row starts with the sentence text, so the two must be equal.
assert english_sentences[-1] == article_data[-1][0], "last sentence does not match article_data"
print(english_sentences[-1])
print(article_data[-1])

For example, the 8-bit string "11111111" above represents 2&nbsp;−&nbsp;1&nbsp;=&nbsp;255.
['For example, the 8-bit string "11111111" above represents 2&nbsp;−&nbsp;1&nbsp;=&nbsp;255.', 4, 6, 30, 46, 0.8, 0.8571428571428571, 0.967741935483871, 0.9787234042553191, 0, 1, 4, 0.0, 0.5, 0.8, 1, 0.5, 5, 0.8333333333333334, 5, 0.8333333333333334, 47, 90, 0, 0, 0, 0, 0, 0.0, 0.0, 0.1]


In [9]:
# Combine the data and labels and convert to Pandas DataFrame

# zip() stops at the shorter input without complaint, which would silently drop
# rows, so refuse to continue when the two lists are not the same length.
if len(article_data) != len(sentence_labels):
    raise ValueError(
        "article_data and sentence_labels differ in length (%d vs %d)"
        % (len(article_data), len(sentence_labels))
    )

# Each row is the article_data record with its label appended as the last field
data_for_modeling = [features + [label] for features, label in zip(article_data, sentence_labels)]
column_names = ["sentence", "cum_sect", "cum_subsect", "cum_para", "cum_sent", "cum_sect_%", "cum_subsect_%", 
                    "cum_para_%", "cum_sent_%", "subsect_in_sect", "para_in_subsect", "sent_in_para", 
                    "subsect_in_sect_%", "para_in_subsect_%", "sent_in_para_%", "para_in_section", 
                    "para_in_section_%", "sent_in_subsect", "sent_in_subsect_%", "sent_in_sect", "sent_in_sect_%", 
                    "total_sents", "sent_len", "subheading", "heading", "table", "bullet", "numbered_bullet", 
                    "topic_mentions", "polarity", "subjectivity", "label"]
df = pd.DataFrame(data_for_modeling, columns=column_names)

In [10]:
# Peek at the first row to check the columns were assembled as expected
df.iloc[:1]

Unnamed: 0,sentence,cum_sect,cum_subsect,cum_para,cum_sent,cum_sect_%,cum_subsect_%,cum_para_%,cum_sent_%,subsect_in_sect,...,sent_len,subheading,heading,table,bullet,numbered_bullet,topic_mentions,polarity,subjectivity,label
0,'''!!!''',0,0,0,0,0,0,0,0,0,...,9,0,0,0,0,0,0,0,0,0


In [11]:
# Split the frame into the target vector (y) and the feature matrix (X);
# the raw sentence text is dropped from X along with the label column
y = df["label"]
X = df.drop(labels=["sentence", "label"], axis="columns")

In [12]:
# All individual models tried, keyed by a short label.
# The trailing comment on each entry records the outcome of trying that model.
# Note: Models were evaluated by the number of summary sentences output and by a
# qualitative assessment of summary quality.
# Every estimator is built with default settings except where an argument is shown.
models = {
        "linear_SVM": LinearSVC(), # Too slow
        "SVM": SVC(), # Too slow
        "RBF SVM": SVC(), # Too slow. NOTE(review): constructed identically to "SVM" above (both SVC() with no arguments) - confirm whether a different kernel was intended
        "KNN4": KNeighborsClassifier(4), # Too slow
        "decision_tree": DecisionTreeClassifier(),  # Decent but not as good as random forest
        "random_forest": RandomForestClassifier(),  # Best
        "gaussianNB": GaussianNB(), # Keeps a lot, not as much of a summary
        "bernoulliNB": BernoulliNB(), # Good amount, but very biased to long sentences with frequent topic mentions
        "logistic": LogisticRegression(), # Too short
        "LDA": LinearDiscriminantAnalysis(), # Too short
        "QDA": QuadraticDiscriminantAnalysis(), # Too long
        "SGD_hinge": SGDClassifier(loss="hinge"), # Too long
        "SGD_huber": SGDClassifier(loss="modified_huber"), # Too short
        "SGD_log": SGDClassifier(loss="log"), # Good summary for some, still long for others. NOTE(review): newer scikit-learn releases appear to rename this loss to "log_loss" - confirm against the installed version
        "SGD_squared_hinge": SGDClassifier(loss="squared_hinge"), # Good for some, but returns no sentences for others
        "SGD_perceptron": SGDClassifier(loss="perceptron"), # Too short
        "adaboost": AdaBoostClassifier(),  # Too slow
        "gradient_boosting": GradientBoostingClassifier()  # Too slow
    }

In [None]:
# Best model
# Min leaf samples limited to 30, to limit model size.
# random_state is pinned because a random forest is stochastic: without a seed,
# every re-run of the notebook would train (and later pickle) a different model.
model = RandomForestClassifier(min_samples_leaf=30, random_state=42)
model.fit(X, y)

In [15]:
# Predict sentences to include in summary for data set to get sense of length of summaries
predictions = model.predict(X)
# Use the array's vectorised sum: the builtin sum() would loop over every
# prediction in Python. int() keeps the displayed value a plain integer.
int(predictions.sum())

773072

In [16]:
# English sentences = ~13M; Simple sentences = ~2M
# Note ~2M sentences predicted would still be longer than our desired summary
# Series.sum() is vectorised, unlike the builtin sum() which iterates element by
# element; int() keeps the displayed tuple made of plain integers.
len(y), int(y.sum())

(13158106, 2252795)

In [17]:
# Wrap the fitted model in a dictionary so it is pickled under the "model" key
model_pack = dict(model=model)

# Write the same pack twice:
#   1. into the data directory, as a back-up
#   2. into the current working directory, for use in the Flask app
for destination in (data_path + "prediction_model.pkl", "prediction_model.pkl"):
    pickle_it(model_pack, destination)