### Loading requirements

In [42]:
import pandas as pd
import numpy as np
import re
import string
import datetime as dt
from scipy import sparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.metrics import classification_report

---
### Loading dataset + defining functions

In [85]:
science_df = pd.read_csv('datasets\\science_dataset_updated.csv',) #Loads dataset
stop_words = set(stopwords.words('english')) #Defines stopwords
ps = PorterStemmer() #Defines stemmer

def preprocess_text_col(dataframe, column_name): #Function for preprocessing text data for model-use by adding 'title-cleaned' column to given dataframe
    
    def remove_punctuation(text): #Removes punctuation from string e.g. 'This is a string. This is another string' → 'This is a string This is another string' 
        no_punct=[words for words in text if words not in string.punctuation]
        words_wo_punct=''.join(no_punct)
        return words_wo_punct
    def tokenize(text): #Tokenizes string e.g. 'This is a string' → ['this', 'is', 'a', 'string']
        split=re.split("\W+", text) 
        return split
    def remove_stopwords(text): #Removes stopwords list of strings e.g. ['this', 'is', 'a', 'string'] → ['string']
        text=[word for word in text if word not in stop_words]
        return text
    def stem_nested_list(lst): #Stems words in a nested list and returns a nested list with stemmed words
        master_list = []
        for x in lst:
            stemmed_list = [ps.stem(word) for word in x]
            master_list.append(stemmed_list)
        return master_list
    
    title_wo_punct = [remove_punctuation(x) for x in dataframe[column_name]]
    title_wo_punct_split = [tokenize(word) for word in title_wo_punct]
    title_wo_punct_split_stopwords = [remove_stopwords(word) for word in title_wo_punct_split]
    dataframe['title_cleaned'] = stem_nested_list(title_wo_punct_split_stopwords)

    
    
def test_model(model): #Function for testing model(s)
    if type(model) == list:
        for x in range(len(model)):
            print("Training score for {}: {:.3f}".format(str(model[x]), model[x].score(X_train, y_train)))
            print("Test score for {}: {:.2f}\n".format(model[x], model[x].score(X_test, y_test)))
    else:
        print("Training score: {:.3f}".format(model.score(X_train, y_train)))
        print("Test score: {:.2f}".format(model.score(X_test, y_test)))

def col_to_matrix(dataframe, column): #Function for converting a column from a pd.dataframe into a scipy.sparse.csr_matrix
    matrix =  dataframe[column].values[np.newaxis] #Creating 2D np array from column by adding an axis to original 1D array (df[col].values)
    matrix = matrix.T #Transposing (rotating) array e.g. (1, 823) to (823, 1)
    matrix = sparse.csr_matrix(matrix) #Creating matrix from array
    return matrix

---
### Preprocessing text-data for model-use

In [86]:
preprocess_text_col(science_df, 'title')

science_df['score_class'] = "" #Creating the score_class column in the dataframe and filling it with 0s
score_list = sorted([x for x in science_df.score]) #Generates a list of all scores sorted in ascending order

for x in range(len(science_df)): #Generates classes for score percentiles
    if science_df['score'][x] >= score_list[int(len(science_df['score']) * 0.99)]:
        science_df['score_class'][x] = 'Top 1%'
    elif science_df['score'][x] >= score_list[int(len(science_df['score']) * 0.95)]:
        science_df['score_class'][x] = 'Top 5%'
    elif science_df['score'][x] >= score_list[int(len(science_df['score']) * 0.90)]:
        science_df['score_class'][x] = 'Top 10%'
    elif science_df['score'][x] >= score_list[int(len(science_df['score']) * 0.80)]:
        science_df['score_class'][x] = 'Top 20%'
    else:
        science_df['score_class'][x] = 'Last 80%'

post_time = col_to_matrix(science_df, '24h_posttime')
title_len = col_to_matrix(science_df, 'title length')

vectorizer = CountVectorizer(lowercase=False, analyzer=lambda x: x)
titles_vectorized = vectorizer.fit_transform(science_df.title_cleaned)
titles_vectorized = sparse.hstack((post_time, titles_vectorized)) #Adding posttime column to matrix
titles_vectorized = sparse.hstack((title_len, titles_vectorized)) #Adding title_length column to matrix

X_train, X_test, y_train, y_test = train_test_split(titles_vectorized, science_df.score_class, test_size = 0.2, random_state = 2021)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  science_df['score_class'][x] = 'Last 80%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  science_df['score_class'][x] = 'Top 1%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  science_df['score_class'][x] = 'Top 20%'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  science_df['score_class'][x] = 'Top 10%'
A va

---
### Creating, training and testing models

In [88]:
begin_time = dt.datetime.now()

logreg = LogisticRegression(C=.1, max_iter=5000).fit(X_train, y_train)
SVC_model = SVC(C=.1).fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train)
dtree = DecisionTreeClassifier().fit(X_train, y_train)
forest = RandomForestClassifier().fit(X_train, y_train)

train_end_time = dt.datetime.now()
print('TRAINING FINISHED!\n'+'Time spent training (hh:mm:ss):', train_end_time - begin_time, '\n')

test_model([logreg, SVC_model, knn, dtree, forest])

print('Time spent testing (hh:mm:ss):', dt.datetime.now() - train_end_time)

TRAINING FINISHED!
Time spent training (hh:mm:ss): 0:00:37.380542 

Training score for LogisticRegression(C=0.1, max_iter=5000): 0.863
Test score for LogisticRegression(C=0.1, max_iter=5000): 0.77

Training score for SVC(C=0.1): 0.802
Test score for SVC(C=0.1): 0.79

Training score for KNeighborsClassifier(): 0.815
Test score for KNeighborsClassifier(): 0.78

Training score for DecisionTreeClassifier(): 0.998
Test score for DecisionTreeClassifier(): 0.70

Training score for RandomForestClassifier(): 0.998
Test score for RandomForestClassifier(): 0.78

Time spent testing (hh:mm:ss): 0:00:06.259303
