In [25]:
import pandas as pd

from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer

In [26]:
train_data = pd.read_csv('data/train.csv')
test_set = train_data.dropna() # Drop any rows with anything missing. Not too many of them anyway.

In [27]:
"""

Model with using empirically guessed parameters and methods
no stopwords,
tfidf as vectoriser,
min_df = 5,
ngram_range = (1,2)


Found sources and examples online with more advanced techniques like stemming / lemmatisation on 
this particular dataset but decided against them since they dont improve accuracy by much.

It would make sense to add the subreddit as one-hotted features, weekend as a binary numerical feature, 
along with score, upvotes and downvotes as continuous numerical features but dont have enough time

"""

vect = TfidfVectorizer(min_df=5,ngram_range=(1,2))
vect.fit(test_set['comment'])


X_tr, X_tst, y_train, y_test = train_test_split(test_set['comment'],test_set['label'])

X_train = vect.transform(X_tr)
X_test = vect.transform(X_tst)

feature_names = vect.get_feature_names()
print(feature_names[::2000])
print(len(feature_names))

clf = LogisticRegression()
clf.fit(X_train,y_train)


score = clf.score(X_test,y_test)
print("score: ", score)
print(X_train.shape)

['00', '5d', 'actually less', 'all about', 'am proud', 'and brian', 'and pillow', 'any', 'are personally', 'asia and', 'babies we', 'be drawn', 'because your', 'bettas', 'bogan', 'buddy pal', 'by modern', 'card art', 'check yourself', 'clothes for', 'condolences on', 'couldn stay', 'cyrillic', 'definition of', 'different job', 'dollar tree', 'dsr', 'embody', 'eventually they', 'f6', 'fever is', 'folded', 'forced out', 'fuck yo', 'generalising', 'go badly', 'gowdy', 'had different', 'hates everyone', 'he ruins', 'hide then', 'holiday spirit', 'iconic', 'in even', 'init', 'is did', 'iso', 'its shitty', 'just last', 'kiting', 'learn your', 'like holding', 'lol great', 'main character', 'matt cassel', 'met his', 'money would', 'muddled', 'names that', 'nice stuff', 'not half', 'obvs', 'of personal', 'olympian', 'only end', 'orioles', 'page says', 'people react', 'planted by', 'positives', 'probably nothing', 'quad', 're oppressing', 'reddits', 'reveals the', 'running off', 'school days', '



score:  0.7234824667472793
(682272, 215441)


In [28]:
# Assuming the model is stable. 
# That is reasonable since we didnt have notable spikes or drops
# in accuracy during many random samplings building train/test sets.
# We can now train on the full data set and go for validation

X = vect.transform(test_set['comment'])
print(X.shape)
y = test_set['label']

feature_names = vect.get_feature_names()
print(feature_names[::2000])
print(len(feature_names))

clf = LogisticRegression()
clf.fit(X,y)

(909697, 215441)
['00', '5d', 'actually less', 'all about', 'am proud', 'and brian', 'and pillow', 'any', 'are personally', 'asia and', 'babies we', 'be drawn', 'because your', 'bettas', 'bogan', 'buddy pal', 'by modern', 'card art', 'check yourself', 'clothes for', 'condolences on', 'couldn stay', 'cyrillic', 'definition of', 'different job', 'dollar tree', 'dsr', 'embody', 'eventually they', 'f6', 'fever is', 'folded', 'forced out', 'fuck yo', 'generalising', 'go badly', 'gowdy', 'had different', 'hates everyone', 'he ruins', 'hide then', 'holiday spirit', 'iconic', 'in even', 'init', 'is did', 'iso', 'its shitty', 'just last', 'kiting', 'learn your', 'like holding', 'lol great', 'main character', 'matt cassel', 'met his', 'money would', 'muddled', 'names that', 'nice stuff', 'not half', 'obvs', 'of personal', 'olympian', 'only end', 'orioles', 'page says', 'people react', 'planted by', 'positives', 'probably nothing', 'quad', 're oppressing', 'reddits', 'reveals the', 'running off',

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
# Now to get the list of id - label from the test.csv
validation_set = pd.read_csv('data/test.csv').dropna()
validation_transformed = vect.transform(validation_set['comment'])
validation_set['predicted_label'] = clf.predict(validation_transformed)

In [30]:
# Filter columns, create the resulting csv
validation_set = validation_set.filter(['predicted_label'])
validation_set.to_csv('results.csv')