In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

pd.options.mode.chained_assignment = None

In [2]:
train = pd.read_csv('data/train.csv')    # reading the data

In [3]:
# check the data

train.head(10)

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0
5,345568,r,440.0,1.0,23866,5577.0,4.0
6,376865,c,147.0,1.0,10759,17184.0,17.0
7,467378,j,2269.0,2.0,54623,312.0,3.0
8,393640,j,111.0,2.0,172926,53738.0,79.0
9,130054,c,2749.0,1.0,10189,227.0,0.0


In [4]:
train.drop(['ID', 'Username'], axis=1, inplace=True)   # dropping the unnecessary columns

In [5]:
# Pre-processing the data

# convert the string to numbers
tags = set(train['Tag'])

train['Tag'] = train['Tag'].replace(to_replace=tags, value=range(len(tags)))

In [6]:
# Scaling the data using the (Max-Min)/Max method

scaler = preprocessing.MinMaxScaler()

x = train.values    # converting to numpy array
scaled = scaler.fit_transform(x)
train = pd.DataFrame(scaled, columns=train.columns)  # reverting to DataFrame

print(train.head(10))

        Tag  Reputation   Answers     Views   Upvotes
0  0.555556    0.003782  0.026316  0.001500  0.000068
1  0.555556    0.024986  0.157895  0.010666  0.001910
2  0.777778    0.001303  0.052632  0.001540  0.000098
3  0.555556    0.000253  0.039474  0.005172  0.000015
4  0.777778    0.004097  0.052632  0.002672  0.000135
5  0.111111    0.000422  0.013158  0.001064  0.000007
6  0.777778    0.000141  0.013158  0.003283  0.000028
7  0.000000    0.002177  0.026316  0.000058  0.000005
8  0.000000    0.000106  0.026316  0.010271  0.000128
9  0.777778    0.002637  0.013158  0.000042  0.000000


In [7]:
# Spiltting the data to check the model

train, val = model_selection.train_test_split(train, train_size=0.8, test_size=0.2)

In [8]:
# separating the independent and dependent variables

x_train = train[['Tag', 'Reputation', 'Answers', 'Views']]
x_val = val[['Tag', 'Reputation', 'Answers', 'Views']]

y_train = train['Upvotes']
y_val = val['Upvotes']

In [9]:
# Now, checking the polynomiality 

poly = preprocessing.PolynomialFeatures(degree=4)      # since there are 4 independent variables

x_train_poly = poly.fit_transform(x_train)        # changing the form of x variables
x_val_poly = poly.fit_transform(x_val)

In [10]:
model = linear_model.LinearRegression()            # creating the model and predicting on the 'val' set
model.fit(x_train_poly, y_train)

y_pred = model.predict(x_val_poly)

# The mean squared error
print('Mean-Sqaured score: %.8f' % mean_squared_error(y_val, y_pred))
print('R2 score: %.2f' % r2_score(y_val, y_pred))

Mean-Sqaured score: 0.00000257
R2 score: 0.92


Now, Using this model to predict the test set and submit the csv file

In [11]:
test = pd.read_csv('data/test.csv')
test_ID = test['ID']

x_test = test.drop(['ID', 'Username'], axis=1)
x_test['Tag'] = x_test['Tag'].replace(to_replace=tags, value=range(len(tags)))

x_test_poly = poly.fit_transform(x_test)

y_pred = model.predict(x_test_poly)

In [24]:
submission = pd.DataFrame(y_pred, index=test_ID)

submission.to_csv(path_or_buf='submission.csv', sep=',', index=True, mode='w+', doublequote=False)