### Loading requirements

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import datetime as dt
import csv

from scipy import sparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report

---
### Loading dataset + defining functions

In [2]:
# Raw per-subreddit exports, read from Windows-style relative paths.
science_df = pd.read_csv('datasets\\science_dataset_updated2.csv')
sports_df = pd.read_csv('datasets\\sports_dataset_updated2.csv')
gaming_df = pd.read_csv('datasets\\gaming_dataset_updated2.csv')
wsb_df = pd.read_csv('datasets\\wsb_dataset_updated2.csv')

# Index labels of the rows whose score is below 2 (wsb is not filtered here).
indexNames = gaming_df.loc[gaming_df['score'] < 2].index
indexNamessc = science_df.loc[science_df['score'] < 2].index
indexNamessp = sports_df.loc[sports_df['score'] < 2].index

# "Clean" frames are the raw frames minus those low-score rows.
gaming_df_clean = gaming_df.drop(index=indexNames)
science_df_clean = science_df.drop(index=indexNamessc)
sports_df_clean = sports_df.drop(index=indexNamessp)

In [21]:
# Mean of the 'title length' column for each subreddit frame.
for label, frame in (
    ('sports', sports_df),
    ('gaming', gaming_df),
    ('science', science_df),
    ('wsb', wsb_df),
):
    print(label, frame['title length'].mean())

sports 76.95814350797266
gaming 52.2487862551254
science 135.01687956204378
wsb 60.337679269882656


In [None]:
# One-hot encode the posting hour of the science frame.
# `df` is an alias of `science_df`, so the string cast below also changes `science_df`.
df = science_df
df['24h_posttime'] = df['24h_posttime'].astype(str)
dummy_24h = pd.get_dummies(df['24h_posttime'])

# Columns removed before modelling.
NON_MODEL_COLUMNS = [
    'post_ID', 'url', 'author', 'timestamp', 'permalink', 'body', 'Flair',
    '24h_posttime', 'title length', 'title_cleaned', 'score_class',
    'has_body_text', 'comms_num',
]

# Each cleaned frame is derived from its OWN subreddit frame. Previously all three
# were built from `df` (science), so the gaming and sports frames held science data.
# NOTE(review): like the original, this starts from the raw frames, so any earlier
# score filtering stored in the *_clean names is replaced — confirm this is intended.
gaming_df_clean = gaming_df.drop(columns=NON_MODEL_COLUMNS)
science_df_clean = science_df.drop(columns=NON_MODEL_COLUMNS)
sports_df_clean = sports_df.drop(columns=NON_MODEL_COLUMNS)

In [None]:
# Box plot of the cleaned r/gaming scores, restricted to scores below 50.
fig, ax = plt.subplots(figsize=(14, 6))
plot = sns.boxplot(
    x=gaming_df_clean.loc[gaming_df_clean.score < 50, 'score'],
    ax=ax,
)
plot.set_title('Cleaned r/gaming score distribution', fontsize=18)
plot.tick_params(labelsize=14)
plot.set_xlabel('Score', fontsize=16)


In [None]:
# Box plot of the unfiltered r/gaming scores, restricted to scores below 50.
fig, ax = plt.subplots(figsize=(14, 6))
plot = sns.boxplot(
    x=gaming_df.loc[gaming_df.score < 50, 'score'],
    ax=ax,
)
plot.set_title('original r/gaming score distribution', fontsize=18)
plot.tick_params(labelsize=14)
plot.set_xlabel('Score', fontsize=16)


In [None]:
# Remove the posting-hour and comment-count columns from the science frame.
# Re-running this cell raises KeyError because the columns are already gone.
science_df = science_df.drop(columns=['24h_posttime', 'comms_num'])

In [None]:
# Correlation heatmap over the numeric columns of the science frame.
fig, ax = plt.subplots(figsize=(14, 10))

# Select numeric/boolean columns explicitly: DataFrame.corr() no longer drops text
# columns silently (it raises on pandas >= 2.0), and this selection also works on
# older pandas versions that lack the `numeric_only` argument.
science_numeric = science_df.select_dtypes(include=['number', 'bool'])

heatmap = sns.heatmap(science_numeric.corr(), ax=ax, annot=True)
heatmap.set_title('Heatmap for: Title length & Body text', fontdict={'fontsize': 16}, pad=12)
heatmap.tick_params(labelsize=16)

In [None]:
# Build the feature matrix for every subreddit frame.
# vectorize_dataframe returns a (feature_matrix, fitted_vectorizer) tuple, so the
# result must be unpacked before `.shape` can be read; the original stored the
# tuple itself and failed on the prints below.
# This cell needs the cells that define vectorize_dataframe and its helpers to
# have been run first (they appear further down in the notebook).
science_vect, science_vectorizer = vectorize_dataframe(science_df)
gaming_vect, gaming_vectorizer = vectorize_dataframe(gaming_df)
sports_vect, sports_vectorizer = vectorize_dataframe(sports_df)
wsb_vect, wsb_vectorizer = vectorize_dataframe(wsb_df)

for feature_matrix in (science_vect, gaming_vect, sports_vect, wsb_vect):
    print(feature_matrix.shape)

In [4]:
stop_words = set(stopwords.words('english'))  # English stopwords from NLTK
ps = PorterStemmer()  # Shared stemmer, only used when stem=True


def preprocess_text_col(dataframe, column_name, stem=False):
    """Tokenise a text column and store the tokens in ``dataframe['title_cleaned']``.

    Each value is lower-cased, stripped of ASCII punctuation and digit characters,
    split on non-word characters, and filtered against ``stop_words``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place; gains (or overwrites) the 'title_cleaned' column.
    column_name : str
        Name of the column holding the raw text.
    stem : bool, default False
        When True, each remaining token is stemmed with the module-level Porter stemmer.

    Returns
    -------
    None
    """
    def remove_punctuation(text):
        # e.g. 'This is a string. No 2nd one!' -> 'this is a string no nd one'
        if not isinstance(text, str):
            # Missing values (NaN/None) are not iterable; treat them as empty text.
            return ''
        return ''.join(
            char.lower()
            for char in text
            if char not in string.punctuation and not char.isdigit()
        )

    def tokenize(text):
        # e.g. 'this is a string' -> ['this', 'is', 'a', 'string']
        # Raw string: "\W" in a normal literal is an invalid escape sequence.
        return re.split(r"\W+", text)

    def remove_stopwords(tokens):
        # e.g. ['this', 'is', 'a', 'string'] -> ['string']
        return [token for token in tokens if token not in stop_words and token != '']

    cleaned_rows = []
    for raw_text in dataframe[column_name]:
        tokens = remove_stopwords(tokenize(remove_punctuation(raw_text)))
        if stem:
            tokens = [ps.stem(token) for token in tokens]
        cleaned_rows.append(tokens)

    dataframe['title_cleaned'] = cleaned_rows

def create_features(dataframe):
    """Add the engineered feature columns to ``dataframe`` in place.

    Reads the 'timestamp', 'title' and 'body' columns and writes:

    * 'timestamp'     - converted with ``pd.to_datetime``
    * '24h_posttime'  - hour component of the timestamp
    * 'title length'  - number of characters in the title
    * 'body'          - cast to ``str``
    * 'has_body_text' - integer flag: 0 when the body is missing, the string
                        'nan' or '[deleted]'; 1 otherwise

    Returns
    -------
    None
    """
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'])
    dataframe['24h_posttime'] = dataframe['timestamp'].dt.hour

    dataframe['title length'] = dataframe['title'].str.len()

    body = dataframe['body']
    # Detect missing bodies before the string cast, so the result does not depend
    # on how astype(str) renders missing values.
    body_missing = body.isna() | body.isin(['nan', '[deleted]'])
    dataframe['body'] = body.astype(str)
    # Single vectorised assignment yields an integer column (the two partial
    # .loc writes used before produced floats).
    dataframe['has_body_text'] = (~body_missing).astype(int)

def create_score_class(dataframe):
    """Add a binary 'over_50' column to ``dataframe`` in place.

    'over_50' is 1 where 'score' is at least 50 and 0 otherwise (including
    missing scores, which compare as False).

    The index is reset in place first, as before, so the previous index is kept
    as a regular column (named 'index' for an unnamed index).

    Returns
    -------
    None
    """
    dataframe.reset_index(inplace=True)
    # One vectorised comparison replaces the row-by-row chained assignment
    # (dataframe['over_50'][x] = ...), which triggers SettingWithCopyWarning and
    # does not write through when pandas Copy-on-Write is active.
    dataframe['over_50'] = (dataframe['score'] >= 50).astype(int)
    
def test_model(model):
    """Print train and test scores for one fitted model or a list of fitted models.

    Scores are computed against the notebook-level X_train / y_train / X_test /
    y_test variables, which must exist when this is called.
    """
    if type(model) is list:
        for fitted in model:
            print("Training score for {}: {:.3f}".format(str(fitted), fitted.score(X_train, y_train)))
            print("Test score for {}: {:.2f}\n".format(str(fitted), fitted.score(X_test, y_test)))
        return

    print("Training score for {}: {:.3f}".format(str(model), model.score(X_train, y_train)))
    print("Test score for {}: {:.2f}".format(str(model), model.score(X_test, y_test)))
        
def col_to_matrix(dataframe, column):
    """Return ``dataframe[column]`` as a single-column ``scipy.sparse.csr_matrix``.

    The column's values are reshaped from (n,) to (n, 1) before conversion.
    """
    column_vector = dataframe[column].values[:, np.newaxis]  # (n,) -> (n, 1)
    return sparse.csr_matrix(column_vector)

---
### Preprocessing text-data for model-use

In [None]:
begin_time = dt.datetime.now()

# Module-level bag-of-words vectorizer for pre-tokenised input (identity analyzer).
# NOTE(review): nothing in this cell fits it; cells that read
# `vectorizerbow_out.vocabulary_` need it fitted elsewhere — confirm.
vectorizerbow_out = CountVectorizer(lowercase = False, analyzer=lambda x: x)


def vectorize_dataframe(dataframe):
    """Build the model feature matrix for one subreddit frame.

    Tokenises the 'title' column (adding 'title_cleaned' to ``dataframe`` in
    place), fits a new bag-of-words ``CountVectorizer`` on those tokens and
    prepends three numeric columns.

    Column order of the result:
    ``[has_body_text, title length, 24h_posttime, <bag-of-words columns...>]``

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'title', '24h_posttime', 'title length' and 'has_body_text'.

    Returns
    -------
    tuple
        ``(feature_matrix, fitted_vectorizer)`` - a scipy sparse matrix and the
        ``CountVectorizer`` fitted on this frame's tokens.
    """
    post_time = col_to_matrix(dataframe, '24h_posttime')
    title_len = col_to_matrix(dataframe, 'title length')
    has_body_text = col_to_matrix(dataframe, 'has_body_text')
    preprocess_text_col(dataframe, 'title')

    # Tokens are already split and lower-cased, so the analyzer passes them through.
    # (The TfidfVectorizer that used to be built here was never used.)
    vectorizerbow = CountVectorizer(lowercase=False, analyzer=lambda tokens: tokens)
    title_counts = vectorizerbow.fit_transform(dataframe['title_cleaned'])

    # One hstack gives the same column order as the previous three chained
    # calls, which each prepended one column.
    features = sparse.hstack((has_body_text, title_len, post_time, title_counts))
    return features, vectorizerbow

# This only times the definitions above, not any vectorisation.
print('Time spent (hh:mm:ss):', dt.datetime.now() - begin_time)

---
### Creating, training and testing models

In [None]:
# Bar chart of the mean score of five wsb subsets.
# The original call had no keyword name before `=` (a syntax error); the means are
# passed as bar heights, which is what one value per category needs.
# NOTE(review): the wsb_* frames are not defined in the visible cells, and the x
# labels 0-4 are paired with frames numbered 0, 2, 3, 4, 5 — confirm the mapping.
sns.barplot(
    x=[0, 1, 2, 3, 4],
    y=[wsb_0.score.mean(), wsb_2.score.mean(), wsb_3.score.mean(), wsb_4.score.mean(), wsb_5.score.mean()],
)

In [None]:
# Rank vocabulary words by their total count across all titles.
# NOTE(review): `vectorizer` and `titles_vectorized_bow` are not assigned at
# notebook level in the visible cells — confirm where they come from.

# word -> column index, ordered by column index
vect_vocab_sorted = dict(sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1]))

titles_vectorized_bow_df = pd.DataFrame(titles_vectorized_bow.toarray())
# Drop the three leading non-word columns: 'has_body_text', 'title_length', '24h_posttime'
titles_vectorized_bow_df = titles_vectorized_bow_df.drop([0, 1, 2], axis=1)

sum_vect = titles_vectorized_bow_df.sum(axis=0)  # per-column totals over all rows

# The remaining columns must line up one-to-one with the vocabulary.
assert len(sum_vect) == len(vect_vocab_sorted), "word columns do not match vocabulary size"

# Pair words with totals in one pass; the previous loop rebuilt two full lists
# on every iteration (quadratic in vocabulary size).
vocab_dict = dict(zip(vect_vocab_sorted, sum_vect))

sorted_vocab = dict(sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True))  # highest total first
sorted_vocab

In [None]:
# Train/test split on the r/gaming feature matrix.
target = gaming_df.score_class

# vectorize_dataframe returns (feature_matrix, fitted_vectorizer); only the matrix
# goes to train_test_split (the original passed the whole tuple). It also
# tokenises the titles itself, so no separate preprocess_text_col call is needed.
gaming_features, gaming_vectorizer = vectorize_dataframe(gaming_df)

X_train, X_test, y_train, y_test = train_test_split(
    gaming_features, target, test_size=0.2, random_state=2021
)

In [13]:
# Grid search: n-gram range x model x hyper-parameter on the cleaned science frame.
# Every fitted model is scored on the train and test split and appended to
# `results` as a dict with keys: model, parameter, train, test, ngrams.
begin_time = dt.datetime.now()

df = science_df_clean
results = []
target = df.over_50


def _record_scores(model_name, parameter_label, parameter, estimator, ngrams, verbose=True):
    """Score ``estimator`` once on the current split and append the row to ``results``.

    Reads the notebook-level X_train / y_train / X_test / y_test of the current
    n-gram iteration. Scoring once (not once for the print and again for the
    record) halves the prediction work, which matters most for k-NN.
    """
    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)
    if verbose:
        print('Results with ngram_range: {} & {} = {}\nTrain: {}\nTest: {}'.format(
            ngrams, parameter_label, parameter, train_score, test_score))
    results.append({
        'model': model_name,
        'parameter': parameter,
        'train': train_score,
        'test': test_score,
        'ngrams': ngrams,
    })


for n in [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]:
    vectorizerbow = CountVectorizer(stop_words = 'english', ngram_range = n)
    vect = vectorizerbow.fit_transform(df.title)

    X_train, X_test, y_train, y_test = train_test_split(vect, target, test_size = 0.2, random_state = 2021)

    for x in [0.01, 0.1, 1, 5, 10, 20]:
        ridge = RidgeClassifier(alpha = x, max_iter=5000).fit(X_train, y_train)
        _record_scores('ridge', 'alpha', x, ridge, n)

    print('ridge done. Time spent so far... ', dt.datetime.now() - begin_time, '\n')

    for x in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors = x, n_jobs=-1).fit(X_train, y_train)
        # Labelled n_neighbors (the printout used to say "alpha" here).
        _record_scores('knn', 'n_neighbors', x, knn, n)

    for x in [20, 50, 70, 100]:
        print('Tree count =', x, '     running...')
        for y in [50, 100, 200, 300]:
            # Seeded so repeated runs give the same forest, matching the seeded split.
            forest = RandomForestClassifier(n_estimators = x, max_depth = y, random_state = 2021).fit(X_train, y_train)
            _record_scores('forest', 'n_estimators, max_depth', [x, y], forest, n, verbose=False)

    for y in [50, 100, 200, 300]:
        dtree = DecisionTreeClassifier(max_depth = y, random_state = 2021).fit(X_train, y_train)
        # Labelled max_depth (the printout used to say "alpha" here).
        _record_scores('dtree', 'max_depth', y, dtree, n)
    print('\n______________________________________________________\n\n\n')


print('\nALL DONE! - Total time spent training/testing (hh:mm:ss):', dt.datetime.now() - begin_time)

Results with ngram_range: (1, 1) & alpha = 0.01
Train: 0.9933611999016474
Test: 0.6184857423795477
Results with ngram_range: (1, 1) & alpha = 0.1
Train: 0.9933611999016474
Test: 0.6125860373647984
Results with ngram_range: (1, 1) & alpha = 1
Train: 0.9921317924760266
Test: 0.6234021632251721
Results with ngram_range: (1, 1) & alpha = 5
Train: 0.9650848291123678
Test: 0.6450344149459194
Results with ngram_range: (1, 1) & alpha = 10
Train: 0.9301696582247356
Test: 0.6529006882989183
Results with ngram_range: (1, 1) & alpha = 20
Train: 0.8861568723875092
Test: 0.6548672566371682
ridge done. Time spent so far...  0:00:00.780934 

Results with ngram_range: (1, 1) & alpha = 1
Train: 0.9933611999016474
Test: 0.6411012782694199
Results with ngram_range: (1, 1) & alpha = 2
Train: 0.7032210474551266
Test: 0.6588003933136677
Results with ngram_range: (1, 1) & alpha = 3
Train: 0.7118268994344726
Test: 0.6578171091445427
Results with ngram_range: (1, 1) & alpha = 4
Train: 0.6727317432997295
Test: 0

KeyboardInterrupt: 

In [None]:
# Print every recorded result, one per line.
for record in results:
    print(record)

In [None]:
# Write the grid-search results (a list of dicts) to CSV, one row per result.
csv_columns = ['model','parameter','train', 'test', 'ngrams']
dict_data = results

#CHANGE NAME BELOW---------------------------------------------------------------------
csv_file = "datasets\\model_results\\aggregate.csv"
#CHANGE NAME ABOVE---------------------------------------------------------------------

# newline='' is required by the csv module; without it every row is followed by
# a blank line on Windows.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(dict_data)

In [None]:
# Fit one ridge classifier on r/gaming titles (bag-of-words counts only).
df = gaming_df
target = df.score_class

# Set explicitly instead of reusing `n`, the loop variable left over from the
# grid-search cell, whose value depends on where that loop stopped.
# NOTE(review): (1, 1) is CountVectorizer's default; change it here if another
# n-gram range was intended.
ngram_range = (1, 1)

vectorizerbow = CountVectorizer(stop_words = 'english', ngram_range = ngram_range)
vect = vectorizerbow.fit_transform(df.title)

X_train, X_test, y_train, y_test = train_test_split(vect, target, test_size = 0.2, random_state = 2021)

ridge = RidgeClassifier(alpha = 1, max_iter=5000).fit(X_train, y_train)

In [None]:
# Transform each subreddit's cleaned titles with the already-fitted `vector` and
# prepend the three numeric feature columns.
# NOTE(review): `vector` is not defined in the visible cells — presumably a
# fitted vectorizer; confirm.

def _with_numeric_features(dataframe, title_matrix):
    """Return ``title_matrix`` with has_body_text, title length and 24h_posttime prepended.

    Column order: [has_body_text, title length, 24h_posttime, <title columns...>],
    the same order the three chained hstack calls produced.
    """
    return sparse.hstack((
        col_to_matrix(dataframe, 'has_body_text'),
        col_to_matrix(dataframe, 'title length'),
        col_to_matrix(dataframe, '24h_posttime'),
        title_matrix,
    ))


science_vect = _with_numeric_features(science_df, vector.transform(science_df.title_cleaned))
print(science_vect.shape)

gaming_vect = _with_numeric_features(gaming_df, vector.transform(gaming_df.title_cleaned))
print(gaming_vect.shape)

sports_vect = _with_numeric_features(sports_df, vector.transform(sports_df.title_cleaned))
print(sports_vect.shape)

## FEATURE IMPORTANCES

In [None]:
# Print the vocabulary word(s) stored at column index 5069.
for word in (term for term, column in vectorizer.vocabulary_.items() if column == 5069):
    print(word)

In [None]:
# for key, value in vectorizerbow_out.vocabulary_.items():
#     if value == coef.iloc[x].idxmin():
#         word = key
#     print(coef.iloc[x].loc[coef.iloc[x].idxmin()], word)

# print()
# for x in range(5):
#     for key, value in vectorizerbow_out.vocabulary_.items():
#         if value == coef.iloc[x].idxmax():
#             word = key
#     print(index[x], coef.iloc[x].loc[coef.iloc[x].idxmax()], word)
# coef.iloc[4].loc[19272]
# Ridge coefficients transposed, with column 0 renamed to 'one'.
coef = pd.DataFrame(ridge.coef_).T.rename(columns={0: 'one'})

In [None]:
# Map every vocabulary word to its random-forest feature importance.

# word -> column index, ordered by column index
vect_vocab_sorted = dict(sorted(vectorizerbow_out.vocabulary_.items(), key=lambda item: item[1]))

# The first three importances belong to the numeric columns placed in front of
# the word columns, so word i sits at position i + 3.
word_importances = forest.feature_importances_[3:]

# Catch a forest trained on a different feature layout instead of pairing words
# with the wrong importances.
assert len(word_importances) == len(vect_vocab_sorted), (
    "forest.feature_importances_ does not match the vocabulary size + 3 numeric columns"
)

# Single pass over words and importances; the previous loop printed every index
# and rebuilt the word list on each iteration.
vocab_dict = dict(zip(vect_vocab_sorted, word_importances))

sorted_vocab = dict(sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True))  # most important first

In [None]:
# Show every feature importance of the fitted forest as a plain list
# (one entry per feature column, so the output can be very long).
list(forest.feature_importances_)

In [None]:
# Importances of the three leading numeric columns, then the ten highest-ranked words.
sorted_vocab_items = sorted_vocab.items()
cols = ['has_body_text', 'title_length', '24h_posttime']
for position, column_name in enumerate(cols):
    print(column_name, ':', forest.feature_importances_[position])
list(sorted_vocab_items)[:10]

In [None]:
# Importance at position 3, i.e. the fourth feature column.
# NOTE(review): presumably the first word column after the three numeric ones — confirm.
forest.feature_importances_[3]

# TEST BELOW

In [1]:
# Hand-written posts pushed through the same feature pipeline and scored by `ridge`.
test_data = {
    'title': [
        'Looking forward to the amazing match between the suns and lakers tonight! Lebron will without a doubt be an issue for suns, but never dismiss the sheer force that is Deandre Ayton!!! GO SUNS',
        'Im going to sleep soon. see you tommorow yall',
        'So ill just write a reaaaaally long text for this title and see how it performs. Maybe long titles just give better results? who am i to say. This wont include any body text either..',
        'Dodging a cash-in-transit robbery. The man has balls of steel',
    ],
    'body': [
        'https://www.nba.com/schedule',
        'Arent your sleepy my guy?',
        'nan',
        'https://www.youtube.com/watch?v=hruqhM2Axnc',
    ],
    'timestamp': [
        '2021-05-26 21:03:19',
        '2021-05-26 02:03:19',
        '2021-05-26 18:03:19',
        '2021-05-26 20:03:19',
    ],
}
test_df = pd.DataFrame(test_data)
create_features(test_df)
preprocess_text_col(test_df, 'title')
print(test_df.head())
test_vect = vector.transform(test_df.title_cleaned)

post_time = col_to_matrix(test_df, '24h_posttime')
title_len = col_to_matrix(test_df, 'title length')
has_body_text = col_to_matrix(test_df, 'has_body_text')

# Final column order: has_body_text, title length, 24h_posttime, then the title columns.
test_vect = sparse.hstack((has_body_text, title_len, post_time, test_vect))

ridge.predict(test_vect)

NameError: name 'pd' is not defined

In [None]:
# Print the recorded results as key/value pairs.
# `results` is built as a list of dicts by the grid-search cell, and a list has
# no .items(); a plain dict is still accepted for older result formats.
result_records = [results] if isinstance(results, dict) else results
for record in result_records:
    for k, v in record.items():
        print(k, v)

In [None]:
# Write the recorded results to CSV.
# - The path had an invalid "\m" escape and a doubled separator ("\\\model_results").
# - The file is opened in a `with` block so it is closed even if a write fails.
# - newline='' is required by the csv module (avoids blank rows on Windows).
results_path = "datasets\\model_results\\unigram_master.csv"

with open(results_path, "w", newline="") as a_file:
    if isinstance(results, dict):
        # Older format: one key/value pair per row.
        writer = csv.writer(a_file)
        for k, v in results.items():
            writer.writerow([k, v])
    elif results:
        # List of result dicts (as built by the grid-search cell): header + one row each.
        writer = csv.DictWriter(a_file, fieldnames=list(results[0].keys()))
        writer.writeheader()
        writer.writerows(results)