# Basic score prediction model 

In [2]:
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

## load data

In [3]:
path = 'comments.csv'

In [4]:
data = pd.read_csv(path)

In [5]:
print ("The original number of data: ")
print (len(data))

The original number of data: 
2868


# data cleaning 

In [6]:
# remove none value 
new_data = data[np.isfinite(data['mean_evaluation'])]

In [7]:
print ("After removing the nan evaluation value, the number of data: ")
print (len(new_data))

After removing the nan evaluation value, the number of data: 
2562


In [8]:
print ("The number of nan evualuation items:")
print (len(data) - len(new_data))

The number of nan evualuation items:
306


In [9]:
comments_texts = new_data['comment_text']
X = new_data['comment_text']
y = new_data['mean_evaluation']

In [10]:
print (comments_texts[1])

The main reason why I agree that negative gearing should be abolished is that it is a major tax break which is largely only available to the wealthy (as unfortunately most tax breaks are). I see no good reason why we should essentially subsidise the property speculations of the wealthy. The consideration that, by driving up housing prices, it makes buying a house that much more unaffordable for first home buyers also seems to be of some importance, as once again this seems to disadvantage those just starting our in life in favour of those with already established wealth.
I am however somewhat worried that simply abolishing negative gearing would cause significant economic upheaval and drive up rental prices. It seems to me that some sort of gradual 'phasing out' might be best, to give the housing market time to readjust to the new, undistorted, conditions


# feature preparation

In [11]:
# build vecor with the feature extraction code

In [12]:
from feature_selection import features_summary
# from basic_features import features_summary

ImportError: No module named 'feature_selection'

In [None]:
X = [features_summary(item) for item in X]

## Adding more features 

In [13]:
import pickle
# Load data (deserialize)
with open('argument_component.pickle', 'rb') as handle:
    text_compo_dic = pickle.load(handle)

In [14]:
texts = new_data['comment_text']

In [15]:
# adding the argument component vectors into the basic feature vector
comp_data = [text_compo_dic[item] for item in texts]

In [16]:
new_X = np.concatenate((X, comp_data ),axis=1)

## training/test dataset split

In [17]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
# print (X_train[0], len(X_train), len(y_train))
# print (X_test[0], len(X_test))

In [17]:
X_train = X[0:1716]
y_train = y[0:1716]
X_test =X[1716:]
y_test = y[1716:]

In [18]:
# adding argument componets data
new_X_train = new_X[0:1716]
new_y_train = y[0:1716]
new_X_test =new_X[1716:]
new_y_test = y[1716:]
# new_comments = 

# Build models 

### 1. regression model 

In [19]:
# linear regression model 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [20]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(new_X_train, new_y_train)

# Make predictions using the testing set
y_pred = regr.predict(new_X_test)



In [21]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
print("mean_absolute_error: %.2f"
      % mean_absolute_error(y_test, y_pred))

Coefficients: 
 [  1.66011932e-02  -1.83142927e-02  -7.65621384e-02   3.61766044e-02
  -8.48954760e-03   7.38919503e-01  -2.84541353e-02   1.69589207e-01
  -7.54648636e-04  -5.74561717e-01   2.58258455e+01   2.64337362e+01
   2.64671820e+01   2.88134017e+01   2.79828625e-03   3.61981783e-03
   5.75945249e-03   1.67159287e-03   1.72478831e+01   1.72454038e+01
   1.64976136e+01   1.65561157e+01]
Mean squared error: 2.50
Variance score: 0.14
mean_absolute_error: 1.29


### 2. MLP model 

http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor

In [22]:
from sklearn.neural_network import MLPRegressor

In [23]:
mlp = MLPRegressor(
    hidden_layer_sizes=(10,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

In [24]:
mlp.fit(new_X_train, new_y_train)

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=9, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [25]:
pred1 = mlp.predict(new_X_test)

In [26]:
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, pred1))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, pred1))
print("mean_absolute_error: %.2f"
      % mean_absolute_error(y_test, pred1))

Mean squared error: 6.38
Variance score: -1.20
mean_absolute_error: 1.74


### adding argument component data

In [27]:
# adding a pca
from sklearn.decomposition import PCA
pca_X = pca = PCA(n_components=6)
pca.fit(new_X)
pca_X = pca.transform(new_X)

In [28]:
pca_X_train = pca_X[0:1716]
pca_y_train = y[0:1716]
pca_X_test =pca_X[1716:]
pca_y_test = y[1716:]

In [29]:
mlp1 = MLPRegressor(
    hidden_layer_sizes=(10,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

In [30]:
mlp1.fit(pca_X_train, pca_y_train)

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=9, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [31]:
pred2 = mlp1.predict(pca_X_test)

In [32]:
print("Mean squared error: %.2f"
      % mean_squared_error(pca_y_test, pred2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(pca_y_test, pred2))
print("mean_absolute_error: %.2f"
      % mean_absolute_error(pca_y_test, pred2))

Mean squared error: 2.25
Variance score: 0.22
mean_absolute_error: 1.19


### error analysis

In [33]:
pca_comment = comments_texts[1716:]

In [69]:
# pca_comment = np.array[pca_comment]
print (len(pca_comment))

846


In [62]:
pca_y_test = np.array(pca_y_test)
pred2 = np.array(pred2)

In [75]:
tmp = 0 
for i in range(len(pred2)):
    try:
        if abs(pred2[i] - pca_y_test[i]) > 3:
            print (comments_texts[i+1716])
            print (pred2[i])
            print (pca_y_test[i])
            tmp += 1
            print ("------")
    except:
        pass

in these days of piracy _ref Somalia - we don't know who is safe and who is armed.  so all arrivals need to go to secure accommodation until we have time and resources to assess them.  There are 5 billion people on the planet, and if 1% want to come here, that is 50 million- much more than we have water for.
1.95349238228
5.0
------
If the money goes to feeding the kids breakfast before school, or to one of the parents being able to spend more time with their children to make sure they're supported with their school work at home, I believe a child's education will benefit.
Relieving financial pressure on low income families can only benefit the child. Creating a safe and positive home environment is just as significant and important in having an impact on a child's education as books and new textbooks.
2.87791517361
6.0
------
Maybe there's only so much of the population that will actually make use of a change in the law, but the effects are much greater than that. A gay couple's frie

In [76]:
print (tmp)

32


## adding tf-idf information 

To compute the tf-idf information is because, I observed that some comments only have one sentence after removing the links in that. If this sentence is quite similiar with the title, the score would be 0, otherwise, it is 1. 

In [35]:
import pickle
# Load data (deserialize)
with open('url_sim_dic.pickle', 'rb') as handle:
    url_sim_dic = pickle.load(handle)

In [36]:
url_tfidf = [url_sim_dic[item] for item in texts]

In [37]:
data_with_tfidf = np.concatenate((new_X, url_tfidf),axis=1)

In [38]:
pca2 = PCA(n_components=4)
pca2.fit(data_with_tfidf)
data_with_tfidf = pca2.transform(data_with_tfidf)

In [39]:
print (data_with_tfidf.shape)
print (new_X.shape)

(2562, 4)
(2562, 22)


In [40]:
# adding argument componets data
X_train_tfidf = data_with_tfidf[0:1716]
y_train_tfidf = y[0:1716]
X_test_tfidf =data_with_tfidf[1716:]
y_test_tfidf = y[1716:]

In [41]:
mlp2 = MLPRegressor(
    hidden_layer_sizes=(10,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

In [42]:
mlp2.fit(X_train_tfidf, y_train_tfidf)

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=9, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [43]:
pred3 = mlp2.predict(X_test_tfidf)

In [44]:
print("Mean squared error: %.2f"
      % mean_squared_error(y_test_tfidf, pred3))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test_tfidf, pred3))
print("mean_absolute_error: %.2f"
      % mean_absolute_error(y_test_tfidf, pred3))

Mean squared error: 2.15
Variance score: 0.26
mean_absolute_error: 1.20


## Error analysis

In [65]:
test_data = comments_texts[1716:]
print (len(X_test), len(y_pred), len(y_test))

846 846 846


In [46]:
y_pred = np.array(pca_y_test)
y_test = np.array(y_test)
test_data = np.array(test_data)
print (y_pred[100])

2.0


In [49]:
for i in range(len(X_test)):
    if y_pred[i] - y_test[i] > 1:
        print (test_data[i+1716])
        print (y_pred[i])
        print (y_test[i])

In [37]:
print (X_test[0])

[0.0, 27.0, 3.0, 9.0, 2]
