# In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

# Read data: the training set and the test set.
df = pd.read_csv('reddit_train.csv')
df2 = pd.read_csv('reddit_test.csv')

# Pre-processing
# A list passed as ``stop_words`` is used literally, so the previous value
# ['english', 'http', 'www'] filtered only those three tokens and never
# applied the English stop-word list.  Build the intended list explicitly:
# scikit-learn's built-in English stop words plus the URL fragments.
# (Sorted so the list is deterministic; a list is what the parameter accepts.)
custom_stop_words = sorted(ENGLISH_STOP_WORDS.union(['http', 'www']))

# Only the non-default options are spelled out; every other option is left at
# its scikit-learn default (word analyzer, unigrams, lowercase, l2 norm,
# min_df=1, default token pattern, IDF weighting enabled).
tfidf_vectorizer = TfidfVectorizer(
    stop_words=custom_stop_words,
    max_df=0.1,         # drop terms that occur in more than 10% of documents
    smooth_idf=False,   # no add-one smoothing of document frequencies
    sublinear_tf=True,  # replace tf with 1 + log(tf)
)

# TF-IDF feature matrix: the vocabulary and IDF weights are learned from the
# training comments only, then re-used unchanged for the test comments.
tfidf = tfidf_vectorizer.fit_transform(df['comments'])
tfidf2 = tfidf_vectorizer.transform(df2['comments'])


# Target labels: map each subreddit name to an integer class id.  The encoder
# is kept so predictions can be mapped back to names later.
le = LabelEncoder().fit(df['subreddits'])
df['label'] = le.transform(df['subreddits'])
y_train = df['label'].to_numpy()

# Feature reduction: keep the top 23% of TF-IDF features ranked by their
# chi-squared score against the training labels, and apply the same
# selection to the test matrix.
select_best = SelectPercentile(score_func=chi2, percentile=23)
select_best.fit(tfidf, y_train)
X_train = select_best.transform(tfidf)
X_test = select_best.transform(tfidf2)

# Model initialize
# One hidden layer of 95 units, trained with MLPClassifier's default solver
# ('adam').  Arguments that this configuration ignores have been dropped so
# the call shows only what actually affects training:
#   * learning_rate='invscaling' and power_t=0.9 are only used by solver='sgd';
#   * validation_fraction=0.8 is only used when early_stopping=True;
#   * shuffle=True and epsilon=1e-8 are already the defaults.
# NOTE(review): no random_state is set, so the fitted weights (and therefore
# the predictions) can differ from run to run.
clf = MLPClassifier(
    hidden_layer_sizes=(95,),
    alpha=0.00005,
    batch_size=200,
    learning_rate_init=0.0009,
    max_iter=100,
    # Training stops once the loss fails to improve by at least ``tol`` for
    # ``n_iter_no_change`` consecutive epochs (or after ``max_iter`` epochs).
    tol=0.1,
    n_iter_no_change=2,
)
# Model fit
clf.fit(X_train, y_train)

# Model predict: integer class ids for the test rows, then decoded back to
# the original subreddit names with the fitted label encoder.
predict = clf.predict(X_test)
predicted_label = le.inverse_transform(predict)

# Save prediction: one row per test comment, with its id and predicted name.
dict_df = dict(Id=df2['id'], Category=predicted_label)
df_label = pd.DataFrame(data=dict_df)
df_label.to_csv(
    'prediction.csv',
    header=True,
    index=False,
    encoding='utf-8',
)
