In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
import os
from sys import platform

 Table

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
5. Load the `test.csv` file
6. Apply the same processing you did to `train.csv` into `test.csv`
7. `predict` the price for that file
8. Only keep the columns you need
9. Export
-----
10. Repeat! 🚀🔥

# Import the csv files

In [7]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [8]:
df_train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.02,Good,D,VS2,63.2,58.0,6.36,6.4,4.03,8.928
1,1,0.35,Very Good,H,VVS2,61.0,57.0,4.54,4.57,2.77,6.477
2,2,0.31,Premium,H,VVS1,60.5,58.0,4.43,4.4,2.67,6.81
3,3,0.38,Ideal,E,VS1,61.4,56.0,4.66,4.69,2.87,6.824
4,4,1.64,Ideal,G,VVS2,61.8,56.0,7.59,7.6,4.69,9.776
5,5,1.62,Ideal,H,VS2,62.4,57.0,7.48,7.53,4.68,9.316
6,6,0.9,Very Good,F,SI1,59.6,63.0,6.24,6.17,3.7,8.292
7,7,1.1,Ideal,E,SI2,61.6,56.0,6.63,6.66,4.09,8.568
8,8,0.33,Ideal,G,VS1,62.3,57.0,4.42,4.38,2.74,6.75
9,9,0.59,Very Good,D,VVS1,61.8,57.0,5.37,5.41,3.33,8.081


In [9]:
df_test.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.81,Ideal,F,SI1,61.5,57.0,6.01,6.06,3.71
1,1,0.5,Fair,F,I1,63.8,58.0,5.08,4.97,3.21
2,2,0.31,Ideal,D,VVS2,60.1,56.0,4.43,4.46,2.67
3,3,1.52,Fair,I,SI2,64.7,58.0,7.19,7.22,4.66
4,4,0.35,Premium,D,VVS1,60.8,58.0,4.55,4.53,2.76
5,5,0.36,Ideal,F,SI1,61.5,56.0,4.59,4.62,2.83
6,6,1.02,Ideal,H,SI2,62.5,53.0,6.43,6.47,4.03
7,7,0.5,Premium,D,SI2,62.9,58.0,5.06,5.01,3.17
8,8,1.01,Premium,G,I1,61.7,62.0,6.41,6.35,3.93
9,9,1.92,Very Good,E,SI2,62.5,59.0,7.85,7.92,4.93


In [10]:
df_train.shape

(40455, 11)

In [11]:
df_test.shape

(13485, 10)

# SUBMISSION -> test

In [12]:
print(df_train.shape[0])
df_train.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
5893,5893,0.3,Ideal,D,VS2,62.4,56.0,4.32,4.27,2.68,6.815


In [13]:
print(df_test.shape[0])
df_test.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
4394,4394,0.7,Premium,E,VS2,61.4,59.0,5.73,5.7,3.51


# Cleaning, processing, feature selection, etc

In [14]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.select_dtypes(exclude='object')
print(df_train_cleaned.shape[0])
df_train_cleaned

40455


Unnamed: 0,id,carat,depth,table,x,y,z,price
0,0,1.02,63.2,58.0,6.36,6.40,4.03,8.928
1,1,0.35,61.0,57.0,4.54,4.57,2.77,6.477
2,2,0.31,60.5,58.0,4.43,4.40,2.67,6.810
3,3,0.38,61.4,56.0,4.66,4.69,2.87,6.824
4,4,1.64,61.8,56.0,7.59,7.60,4.69,9.776
...,...,...,...,...,...,...,...,...
40450,40450,1.20,62.2,55.0,6.77,6.81,4.23,9.149
40451,40451,1.50,64.2,56.0,7.30,7.09,4.62,9.077
40452,40452,1.06,61.9,55.0,6.54,6.58,4.06,8.892
40453,40453,0.31,60.1,58.0,4.40,4.38,2.64,6.385


# Train on train.csv

![](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/4_train-test-split.jpg)

## Train, test split

In [15]:
X = df_train_cleaned.iloc[:,:-1]
y = df_train_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

## Fit

In [16]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say I'm done training")

In [31]:
y_pred = regressor.predict(X_test)

In [32]:
len(y_pred)

14160

In [18]:
np.sqrt(mean_squared_error(y_pred, y_test))

0.3277510957425164

-----
-----
-----
-----


# Applying same cleaning & processing to my `test.csv`

In [19]:
df_test_cleaned = df_test.select_dtypes(exclude='object')
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
11196,11196,0.54,60.9,56.0,5.31,5.27,3.22


# Predict on the `test.csv`

In [20]:
from sklearn import metrics

# Just for feedback
if platform == "darwin":
    os.system("say -v ayam don predictin")

# DF with two columns

In [21]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,id,carat,depth,table,x,y,z
4460,4460,0.42,61.8,56.0,4.82,4.86,2.99


In [25]:
y_pred = y_pred[:13485]
print("Length of y_pred_rf:", len(y_pred))
print("Length of df_test before reset:", len(df_test))

Length of y_pred_rf: 13485
Length of df_test before reset: 13485


In [26]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [27]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
4721,4721,7.176741


# Export (index=False)

In [29]:
df_for_submission.to_csv("my_submission.csv", index=False)

# Just for feedback
if platform == "darwin":
    os.system("say redi for submission")

In [30]:
df_for_submission.head()

Unnamed: 0,id,price
0,0,8.355965
1,1,8.493786
2,2,7.839232
3,3,8.599336
4,4,6.866394
