In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

pd.options.mode.chained_assignment = None

In [2]:
train = pd.read_csv('data/train.csv')    # reading the data

In [3]:
# check the data

train.head(10)

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0
5,345568,r,440.0,1.0,23866,5577.0,4.0
6,376865,c,147.0,1.0,10759,17184.0,17.0
7,467378,j,2269.0,2.0,54623,312.0,3.0
8,393640,j,111.0,2.0,172926,53738.0,79.0
9,130054,c,2749.0,1.0,10189,227.0,0.0


In [4]:
train.drop(['ID', 'Username'], axis=1, inplace=True)   # dropping the unnecessary columns

In [5]:
# Pre-processing the data

# convert the string to numbers
tags = set(train['Tag'])

train['Tag'] = train['Tag'].replace(to_replace=tags, value=range(len(tags)))

In [6]:
# Scaling the data using the (Max-Min)/Max method

scaler = preprocessing.StandardScaler()

x = train.values    # converting to numpy array
scaled = scaler.fit_transform(x)
train = pd.DataFrame(scaled, columns=train.columns)  # reverting to DataFrame

print(train.head(10))

        Tag  Reputation   Answers     Views   Upvotes
0  1.746117   -0.141573 -0.535736 -0.269158 -0.082258
1  1.746117    0.675238  2.257943  0.323087  0.233127
2 -0.832059   -0.237059  0.023000 -0.266540 -0.077247
3  1.746117   -0.277486 -0.256368 -0.031882 -0.091444
4 -0.832059   -0.129415  0.023000 -0.193426 -0.070845
5  0.641184   -0.270982 -0.815104 -0.297297 -0.092835
6 -0.832059   -0.281809 -0.815104 -0.153923 -0.089217
7  0.272873   -0.203395 -0.535736 -0.362332 -0.093114
8  0.272873   -0.283140 -0.535736  0.297604 -0.071958
9 -0.832059   -0.185658 -0.815104 -0.363382 -0.093949


In [7]:
# Spiltting the data to check the model

train, val = model_selection.train_test_split(train, train_size=0.8, test_size=0.2)

In [8]:
# separating the independent and dependent variables

x_train = train[['Tag', 'Reputation', 'Answers', 'Views']]
x_val = val[['Tag', 'Reputation', 'Answers', 'Views']]

y_train = train['Upvotes']
y_val = val['Upvotes']

In [9]:
# Now, checking the polynomiality 

poly = preprocessing.PolynomialFeatures(degree=3)      # since there are 4 independent variables

x_train_poly = poly.fit_transform(x_train)        # changing the form of x variables
x_val_poly = poly.fit_transform(x_val)

In [10]:
model = linear_model.LinearRegression()            # creating the model and predicting on the 'val' set
model.fit(x_train_poly, y_train)

y_pred = model.predict(x_val_poly)

# The mean squared error
print('Mean-Sqaured score: %.8f' % mean_squared_error(y_val, y_pred))
print('R2 score: %.2f' % r2_score(y_val, y_pred))

Mean-Sqaured score: 0.08178130
R2 score: 0.91


Now, Using this model to predict the test set and submit the csv file

In [11]:
test = pd.read_csv('data/test.csv')
test_ID = test['ID']

x_test = test.drop(['ID', 'Username'], axis=1)
x_test['Tag'] = x_test['Tag'].replace(to_replace=tags, value=range(len(tags)))

In [12]:
# Scaling the data using the (Max-Min)/Max method

x = x_test.values    # converting to numpy array
scaled = scaler.fit_transform(x)
x_test = pd.DataFrame(scaled, columns=x_test.columns)  # reverting to DataFrame

print(x_test.head(10))

        Tag  Reputation   Answers     Views
0  1.741412   -0.081543 -0.255734  0.041742
1 -0.833141    0.594400  0.582853 -0.337505
2  1.373619   -0.250583 -0.814791 -0.108028
3  1.373619   -0.283044  0.582853 -0.140874
4  1.373619   -0.123463  1.700969  0.340957
5 -0.833141   -0.167281 -0.814791 -0.342770
6 -0.097554   -0.253019 -0.814791 -0.290495
7 -0.833141   -0.256208  0.023795  0.347616
8 -1.568728    0.028880 -0.535262 -0.315985
9 -0.465348   -0.277705  0.023795 -0.310509


In [13]:
# Predicting the upvotes

x_test_poly = poly.fit_transform(x_test)

y_pred = model.predict(x_test_poly)

In [14]:
submission = pd.DataFrame(y_pred, index=test_ID)

submission.to_csv(path_or_buf='submission.csv', sep=',', index=True, mode='w+', doublequote=False)