### Loading requirements

In [29]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [1]:
import pandas as pd
import numpy as np
import re
import string
import datetime as dt
from scipy import sparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report

---
### Loading dataset + defining functions

In [2]:
science_df = pd.read_csv('datasets\\science_dataset_updated.csv',) #Loads dataset

stop_words = set(stopwords.words('english')) #Defines stopwords
ps = PorterStemmer() #Defines stemmer

def preprocess_text_col(dataframe, column_name): #Function for preprocessing text data for model-use by adding 'title-cleaned' column to given dataframe
    
    def remove_punctuation(text): #Removes punctuation from string e.g. 'This is a string. This is another string' → 'this is a string This is another string' 
        no_punct=[words.lower() for words in text if words not in string.punctuation and words.isdigit() == False]
        words_wo_punct=''.join(no_punct)
        return words_wo_punct
    def tokenize(text): #Tokenizes string e.g. 'This is a string' → ['this', 'is', 'a', 'string']
        split=re.split("\W+", text) 
        return split
    def remove_stopwords(text): #Removes stopwords list of strings e.g. ['this', 'is', 'a', 'string'] → ['string']
        text=[word for word in text if word not in stop_words]
        return text
    def stem_nested_list(lst): #Stems words in a nested list and returns a nested list with stemmed words
        master_list = []
        for x in lst:
            stemmed_list = [ps.stem(word) for word in x]
            master_list.append(stemmed_list)
        return master_list
    
    title_wo_punct = [remove_punctuation(x) for x in dataframe[column_name]]
    title_wo_punct_split = [tokenize(word) for word in title_wo_punct]
    title_wo_punct_split_stopwords = [remove_stopwords(word) for word in title_wo_punct_split]
    dataframe['title_cleaned'] = title_wo_punct_split_stopwords
#     dataframe['title_cleaned'] = stem_nested_list(title_wo_punct_split_stopwords)    

def create_features(dataframe):
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp']) #Changing 'timestamp' column to dtype = datetime
    dataframe['24h_posttime'] = dataframe['timestamp'].dt.hour #Adding hour posttime to dataset
    
    dataframe['score_class'] = "" #Creating the score_class column in the dataframe and filling it with empty strings
    
    dataframe['body'] = dataframe['body'].astype(str)
    dataframe.loc[(science_df['body'] == 'nan') | (dataframe['body'] == '[deleted]'), 'has_body_text'] = int(0) 
    dataframe.loc[(science_df['body'] != 'nan') & (dataframe['body'] != '[deleted]'), 'has_body_text'] = int(1)
    dataframe['has_body_text'] = dataframe['has_body_text'].astype(int)
    
    for x in range(len(dataframe)): #Generates classes for score percentiles
        if dataframe['score'][x] >= dataframe.score.quantile(0.99):
            dataframe['score_class'][x] = 'Top 1%'
        elif dataframe['score'][x] >= dataframe.score.quantile(0.95):
            dataframe['score_class'][x] = 'Top 5%'
        elif dataframe['score'][x] >= dataframe.score.quantile(0.9):
            dataframe['score_class'][x] = 'Top 10%'
        elif dataframe['score'][x] >= dataframe.score.quantile(0.8):
            dataframe['score_class'][x] = 'Top 20%'
        else:
            dataframe['score_class'][x] = 'Last 80%'
    
def test_model(model): #Function for testing model(s)
    if type(model) == list:
        for x in range(len(model)):
            print("Training score for {}: {:.3f}".format(str(model[x]), model[x].score(X_train, y_train)))
            print("Test score for {}: {:.2f}\n".format(str(model[x]), model[x].score(X_test, y_test)))
    else:
        print("Training score for {}: {:.3f}".format(str(model), model.score(X_train, y_train)))
        print("Test score for {}: {:.2f}".format(str(model), model.score(X_test, y_test)))
        
def col_to_matrix(dataframe, column): #Function for converting a column from a pd.dataframe into a scipy.sparse.csr_matrix
    matrix = dataframe[column].values[np.newaxis] #Creating 2D np array from column by adding an axis to original 1D array (df[col].values)
    matrix = matrix.T #Transposing (rotating) array e.g. (1, 823) to (823, 1)
    matrix = sparse.csr_matrix(matrix) #Creating matrix from array
    return matrix

---
### Preprocessing text-data for model-use

In [3]:
create_features(science_df)
science_df.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['score_class'][x] = 'Last 80%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['score_class'][x] = 'Top 1%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['score_class'][x] = 'Top 20%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['score_class'][x] = 'Top 10%'
A value 

Index(['post_ID', 'title', 'url', 'author', 'timestamp', 'comms_num',
       'permalink', 'body', 'Flair', 'title length', '24h_posttime', 'score',
       'score_class', 'has_body_text'],
      dtype='object')

In [33]:
begin_time = dt.datetime.now()

preprocess_text_col(science_df, 'title')
score_list = sorted([x for x in science_df.score]) #Generates a list of all scores sorted in ascending order

post_time = col_to_matrix(science_df, '24h_posttime')
title_len = col_to_matrix(science_df, 'title length')
has_body_text = col_to_matrix(science_df, 'has_body_text')

vectorizer = CountVectorizer(lowercase=False, analyzer=lambda x: x)
titles_vectorized = vectorizer.fit_transform(science_df.title_cleaned)
titles_vectorized = sparse.hstack((post_time, titles_vectorized)) #Adding posttime column to matrix
titles_vectorized = sparse.hstack((title_len, titles_vectorized)) #Adding title_length column to matrix
titles_vectorized = sparse.hstack((has_body_text, titles_vectorized)) #Adding has_body_text column to matrix

X_train, X_test, y_train, y_test = train_test_split(titles_vectorized, science_df.score_class, test_size = 0.2, random_state = 2021)

print('Time spent (hh:mm:ss):', dt.datetime.now() - begin_time)

Time spent (hh:mm:ss): 0:00:00.579284


---
### Creating, training and testing models (X_train2 = regression)

In [44]:
begin_time = dt.datetime.now()

for x in [0.01, 0.1, 1]:
    ridge = RidgeClassifier(alpha = x, max_iter=5000).fit(X_train, y_train)
    SVC_model = SVC(C = x).fit(X_train, y_train)
    test_model(ridge)

for x in [1, 3, 5]:
    SVC_model = SVC(C = x).fit(X_train, y_train)
    test_model(SVC_model)
    
for x in [1, 2, 3, 4, 5]:
    knn = KNeighborsClassifier(n_neighbors = x).fit(X_train, y_train)
    test_model(knn)
    
for x in [20, 50, 70, 100]:
    for y in [50, 100, 200, 300]:
        forest = RandomForestClassifier(n_estimators = x, max_depth = y).fit(X_train, y_train)
        dtree = DecisionTreeClassifier(max_depth = y).fit(X_train, y_train)
        test_model([forest, dtree])
        
train_end_time = dt.datetime.now()
print('Time spent training (hh:mm:ss):', train_end_time - begin_time)

Training score for RidgeClassifier(alpha=0.01, max_iter=5000): 0.993
Test score for RidgeClassifier(alpha=0.01, max_iter=5000): 0.67
Training score for RidgeClassifier(alpha=0.1, max_iter=5000): 0.992
Test score for RidgeClassifier(alpha=0.1, max_iter=5000): 0.70
Training score for RidgeClassifier(alpha=1, max_iter=5000): 0.989
Test score for RidgeClassifier(alpha=1, max_iter=5000): 0.73
Training score for SVC(C=1): 0.802
Test score for SVC(C=1): 0.79
Training score for SVC(C=3): 0.802
Test score for SVC(C=3): 0.79
Training score for SVC(C=5): 0.802
Test score for SVC(C=5): 0.79
Training score for KNeighborsClassifier(n_neighbors=1): 0.999
Test score for KNeighborsClassifier(n_neighbors=1): 0.68
Training score for KNeighborsClassifier(n_neighbors=2): 0.845
Test score for KNeighborsClassifier(n_neighbors=2): 0.76
Training score for KNeighborsClassifier(n_neighbors=3): 0.832
Test score for KNeighborsClassifier(n_neighbors=3): 0.76
Training score for KNeighborsClassifier(n_neighbors=4): 0

In [25]:
begin_time = dt.datetime.now()

forest = RandomForestClassifier(n_estimators = 100).fit(X_train, y_train)
test_model(forest)

train_end_time = dt.datetime.now()
print('Time spent training (hh:mm:ss):',  train_end_time - begin_time)

MemoryError: could not allocate 41877504 bytes

In [5]:
feature_names_total = vectorizer.get_feature_names()
feature_names_total.extend(['posttime', 'title_length'])
sorted_coefs_desc = sorted(list(zip(list(feature_names_total), logreg.coef_[0])), key = lambda e: e[1], reverse=True)
sorted_coefs_asc = sorted(list(zip(list(feature_names_total), logreg.coef_[0])), key = lambda e: e[1])
features_forest = sorted(list(zip(list(feature_names_total), forest.feature_importances_)), key = lambda e: e[1], reverse=True)
# print(features_forest[:10], '\n')

for x in range(10):
    print(sorted_coefs_desc[x])

print('\n')
for x in range(10):
    
    print(sorted_coefs_asc[x])
for x in range(len(sorted_coefs_desc)):
    if sorted_coefs_desc[x][0] == 'posttime' :
        print(sorted_coefs_desc[x])
    else:
        pass
for x in range(len(sorted_coefs_desc)):
    if sorted_coefs_desc[x][0] == 'title_length' :
        print(sorted_coefs_desc[x])
    else:
        pass

('aaai', 0.43624606175774516)
('visualization', 0.3223039299815076)
('agedependent', 0.3194857417795549)
('ampamp', 0.2989954111354649)
('aggravates', 0.29860082349455064)
('neuropathic', 0.2947864426034513)
('painfree', 0.28919528208244805)
('genebased', 0.2883023619813986)
('urbanization', 0.2733032167796802)
('movement', 0.27223045102284404)


('finelycrushed', -0.4696689965724894)
('viii', -0.43792346714062)
('yeast', -0.3933578605576364)
('ancienttimes', -0.38286713799763833)
('tetrahydrocannabinol', -0.37844303043497324)
('autismrelated', -0.3699753451279366)
('heights', -0.35714510334625044)
('mitigates', -0.3540053596705342)
('halffemale', -0.34956851962079716)
('thousand', -0.33847137317057907)
('posttime', -0.053531051994896224)
('title_length', 0.009213906232209745)


In [None]:
science_df['24h_posttime'].value_counts()
science_df['body'].value_counts()