In [None]:
import pandas as pd
import pandas_profiling as pp
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# `data` is the frame the notebook explores and trains on (later cells read `data.Upvotes`),
# so it must be loaded from the train file; `data_test` is the held-out competition test file.
# The two paths were previously swapped.
data = pd.read_csv('/kaggle/input/predict-the-number-of-upvotes-a-post-will-get/train_NIR5Yl1.csv')
data_test = pd.read_csv('/kaggle/input/predict-the-number-of-upvotes-a-post-will-get/test_8i3B3FC.csv')

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
data.head()

In [None]:
# Summary statistics of the loaded frame's columns (rendered as the cell output).
data.describe()

In [None]:
# Build the pandas-profiling report for `data`; as the last expression of the cell it is displayed inline.
pp.ProfileReport(data)

### Data Analysis

- The number of features is not very large.
- We have one categorical feature: Tag.
- We have an ID column, which may not be required as it is just like an index.
- Numerical data is not normalized.
- No null values are present.
- Username can be duplicated, as the same user can have multiple posts; either remove it or convert it before using it.
- Upvotes is highly skewed - maybe we should bucketize it.
- Tag has 10 unique values.
- Upvotes is highly dependent on Views, Reputation and Answers.

### Data Visualization

In [None]:
## Upvotes vs. Tag - how strongly the upvote count depends on the tag category
%matplotlib inline
# Trailing semicolon keeps the returned object's repr out of the cell output.
sns.catplot(x="Tag", y="Upvotes", data=data);

In [None]:
# Distribution of the Upvotes column: 50 histogram bins, KDE curve disabled, rug marks enabled.
# NOTE(review): `distplot` is deprecated in newer seaborn releases (histplot/displot replace it) —
# confirm the installed seaborn version before migrating.
sns.distplot(data.Upvotes, bins=50, kde=False, rug=True);

### Data Manipulation

- Shuffle the data.
- Remove the ID and Username features for the first cut.
- Convert the categorical feature Tag into integer labels.
- Normalize the data.

- We can bucketize a few features, e.g. Upvotes.

### Predict Upvote - Regression Phase 1 

In [None]:
# Shuffle the rows with a fixed seed so a Restart & Run All reproduces the same row order
# (and therefore the same downstream numbers); 42 matches the seed used for the train/val split.
data = shuffle(data, random_state=42)

In [None]:
# Show the available columns, then pick the numeric predictors and the target.
print('Coloumn Names ', data.columns)
# Numeric input columns; the categorical `Tag` column is not listed here because
# preprocess_data appends its encoded form to the feature matrix itself.
features = ['Reputation', 'Answers', 'Views']
# Target column, kept as a list so that data[label].values is 2-D (one column).
label = ['Upvotes']

In [None]:
def preprocess_data(data_set, features, labels, scaler=None, binarizer=None, is_test_data=False):
    """Build a scaled feature matrix (numeric features + encoded ``Tag``) and the target array.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame holding the ``features`` columns and a ``Tag`` column, plus the
        ``labels`` columns when targets are available.
    features : list of str
        Numeric columns to use as predictors.
    labels : str or list of str
        Target column(s).
    scaler, binarizer : fitted transformers, optional
        Required when ``is_test_data`` is True: the objects returned by an earlier
        training-mode call. Ignored (replaced by freshly fitted ones) otherwise.
    is_test_data : bool, default False
        False: fit a new ``LabelEncoder`` on ``Tag`` and a new ``StandardScaler`` on
        the assembled matrix. True: only apply the transformers that were passed in.

    Returns
    -------
    X : numpy.ndarray
        Scaled matrix whose columns are ``features`` followed by the encoded tag.
    y : numpy.ndarray or None
        ``data_set[labels].values``; ``None`` when ``is_test_data`` is True and the
        label column(s) are absent (an unlabelled test set).
    scaler, binarizer
        The transformers that were used, so they can be reused on other data.

    Raises
    ------
    ValueError
        If ``is_test_data`` is True but ``scaler`` or ``binarizer`` was not supplied.
    """
    if is_test_data:
        # Fail early with a clear message instead of an AttributeError on None below.
        if scaler is None or binarizer is None:
            raise ValueError(
                "is_test_data=True requires the fitted scaler and binarizer "
                "returned by the training call"
            )
    else:
        binarizer = LabelEncoder()
        scaler = StandardScaler()
        binarizer.fit(data_set.Tag)

    # Encode Tag as integer codes and turn them into a single column.
    # NOTE(review): integer codes put an arbitrary order on the tag categories;
    # consider one-hot encoding if that order should not influence the model.
    tag_column = binarizer.transform(data_set.Tag).reshape(-1, 1)
    X = np.concatenate((data_set[features].values, tag_column), axis=1)

    # An unlabelled test set has no target columns; return None rather than raising KeyError.
    label_columns = [labels] if isinstance(labels, str) else list(labels)
    if is_test_data and not all(column in data_set.columns for column in label_columns):
        y = None
    else:
        y = data_set[labels].values

    if not is_test_data:
        scaler.fit(X)
    X = scaler.transform(X)
    return X, y, scaler, binarizer

In [None]:
# Training-mode call (is_test_data left at its default): fits a new encoder and scaler on `data`
# and returns them so they can be reused on other data later.
X, y, scaler, binarizer = preprocess_data(data_set=data, features=features, labels= label)

In [None]:
# Sanity check: report the dimensions of the feature matrix and the target array.
for caption, array in (('Shape of X', X), ('Shape of y', y)):
    print(caption, array.shape)

In [None]:
 X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Sanity check: report the dimensions of every piece of the train/validation split.
split_shapes = (
    ('Shape of X train', X_train),
    ('Shape of y train', y_train),
    ('Shape of X val', X_val),
    ('Shape of y val', y_val),
)
for caption, array in split_shapes:
    print(caption, array.shape)

In [None]:
## Normal Equation Implementation
step1 = np.dot(X_train.T, X_train)
step2 = np.linalg.pinv(step1)
step3 = np.dot(step2, X_train.T)
theta = np.dot(step3, y_train)

In [None]:
print('Shape of the Thetha', theta.shape)

In [None]:
y_val_pred = np.dot(X_val, theta)

In [None]:
# Sanity check: dimensions of the validation predictions held in y_val_pred.
print('Shape of the prediction', y_val_pred.shape)

In [None]:
# Validation error of the current predictions. Arguments follow the documented
# (y_true, y_pred) order; both metrics are symmetric, so the printed values are unchanged.
print('MAE: ', mean_absolute_error(y_val, y_val_pred))
print('MSE: ', mean_squared_error(y_val, y_val_pred))


- A few data points have very large values; because MSE squares each error, these outliers may be why the MSE is so much larger than the MAE.
- The error seems to be very large.

In [None]:
# Linear regression trained with stochastic gradient descent.
# random_state pins the sample shuffling so the fitted model is reproducible across runs.
regressor = SGDRegressor(max_iter=10000, tol=1e-3, random_state=42)
# y_train is a single-column 2-D array; flatten it to the 1-D target the estimator expects
# (avoids the column-vector conversion warning).
regressor.fit(X_train, y_train.ravel())

In [None]:
# Predict on the validation features with the fitted regressor.
# Note: this rebinds y_val_pred, replacing the predictions computed earlier in the notebook.
y_val_pred = regressor.predict(X_val)

In [None]:
# Sanity check: dimensions of the regressor's validation predictions.
print('Shape of the prediction', y_val_pred.shape)

In [None]:
# Validation error of the SGD regressor. Arguments follow the documented
# (y_true, y_pred) order; both metrics are symmetric, so the printed values are unchanged.
print('MAE: ', mean_absolute_error(y_val, y_val_pred))
print('MSE: ', mean_squared_error(y_val, y_val_pred))