<a href="https://colab.research.google.com/github/quanggquangg/phone_number_predict/blob/main/DecisionTree_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

path = '/content/dataset_cleaned_big.csv'
df = pd.read_csv(path)
df = df.drop(labels = 'Unnamed: 0', axis = 1)

In [2]:
df.head()

Unnamed: 0,price_vnd,sim_number,sim_price_range,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi
0,12000000,926052005,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,2005
1,199000000,769889999,0,2,1,1,0,0,0,0,0,1,1,1,0,0,0,9999
2,104000000,786008888,0,2,1,1,0,0,0,0,0,1,1,1,0,0,0,8888
3,14000000,834141141,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,1141
4,11325000,856637678,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,7678


In [3]:
df.shape

(34784, 18)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34784 entries, 0 to 34783
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   price_vnd        34784 non-null  int64
 1   sim_number       34784 non-null  int64
 2   sim_price_range  34784 non-null  int64
 3   network          34784 non-null  int64
 4   sim_tam_hoa      34784 non-null  int64
 5   sim_tu_quy       34784 non-null  int64
 6   sim_ngu_quy      34784 non-null  int64
 7   sim_nam_sinh     34784 non-null  int64
 8   sim_loc_phat     34784 non-null  int64
 9   sim_so_tien      34784 non-null  int64
 10  sim_than_tai     34784 non-null  int64
 11  sim_lap          34784 non-null  int64
 12  sim_dao          34784 non-null  int64
 13  sim_lap_kep      34784 non-null  int64
 14  sim_kep          34784 non-null  int64
 15  sim_lap_ba       34784 non-null  int64
 16  sim_so_lui       34784 non-null  int64
 17  4_so_cuoi        34784 non-null  int64
dtypes: int

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price_vnd,34784.0,72485520.0,1416408000.0,10000000.0,11000000.0,12000000.0,50000000.0,168000000000.0
sim_number,34784.0,783474400.0,195270000.0,325012006.0,775557721.5,837411283.0,916565200.0,997979700.0
sim_price_range,34784.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
network,34784.0,2.20656,0.9702895,1.0,1.0,2.0,3.0,4.0
sim_tam_hoa,34784.0,0.4665938,0.49889,0.0,0.0,0.0,1.0,1.0
sim_tu_quy,34784.0,0.2035706,0.402659,0.0,0.0,0.0,0.0,1.0
sim_ngu_quy,34784.0,0.05186293,0.2217534,0.0,0.0,0.0,0.0,1.0
sim_nam_sinh,34784.0,0.1036971,0.3048716,0.0,0.0,0.0,0.0,1.0
sim_loc_phat,34784.0,0.08800023,0.2832993,0.0,0.0,0.0,0.0,1.0
sim_so_tien,34784.0,0.12546,0.3312445,0.0,0.0,0.0,0.0,1.0


In [6]:
print("Missing Values by Column")
print("-"*30)
print(df.isna().sum())
print("-"*30)
print("TOTAL MISSING VALUES:",df.isna().sum().sum())

Missing Values by Column
------------------------------
price_vnd          0
sim_number         0
sim_price_range    0
network            0
sim_tam_hoa        0
sim_tu_quy         0
sim_ngu_quy        0
sim_nam_sinh       0
sim_loc_phat       0
sim_so_tien        0
sim_than_tai       0
sim_lap            0
sim_dao            0
sim_lap_kep        0
sim_kep            0
sim_lap_ba         0
sim_so_lui         0
4_so_cuoi          0
dtype: int64
------------------------------
TOTAL MISSING VALUES: 0


In [7]:
X = df.drop("price_vnd", axis=1)
y = df["price_vnd"]

In [8]:
X = X.drop("sim_number", axis=1)

In [9]:
X = X.drop("sim_price_range", axis=1)

In [10]:
X = X.drop("4_so_cuoi", axis=1)

In [11]:
X.head()

Unnamed: 0,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui
0,4,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,1,1,1,0,0,0
2,2,1,1,0,0,0,0,0,1,1,1,0,0,0
3,3,0,0,0,0,0,0,0,0,0,0,0,1,0
4,3,0,0,0,0,0,1,0,0,0,0,0,0,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X_test.shape

(6957, 14)

In [14]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)).mean()
    return rmse
    

def evaluation(y, predictions):
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    r_squared = r2_score(y, predictions)
    return mae, mse, rmse, r_squared

In [15]:
models = pd.DataFrame(columns=["Model","MAE","MSE","RMSE","R2 Score","RMSE (Cross-Validation)"])

In [16]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(clf)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Decision Tree","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

MAE: 44103088.186574675
MSE: 1.3598149450834102e+16
RMSE: 116611103.46289542
R2 Score: 0.9418411032599607
------------------------------
RMSE Cross-Validation: 122228119.75751188


In [21]:
path = '/content/dataset_cleaned_big_test.csv'
test = pd.read_csv(path)

In [22]:
test.head()

Unnamed: 0,sim_number,sim_price_range,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi
0,707963979,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,3979
1,818999191,0,3,1,0,0,0,0,0,0,1,0,0,0,0,0,9191
2,585444777,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,4777
3,378569788,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9788
4,386495666,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,5666


In [23]:
test = test.drop(labels = 'sim_number', axis = 1)

In [24]:
test = test.drop(labels = '4_so_cuoi', axis = 1)

In [26]:
test = test.drop(labels = 'sim_price_range', axis = 1)

In [27]:
test.head()

Unnamed: 0,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui
0,2,0,0,0,0,0,0,1,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,1,0,0,0,0,0
2,4,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
result = clf.predict(test)

ValueError: ignored

In [32]:
print(result)

[10000000 12000000 10000000 ... 10000000 12000000 10000000]


In [33]:
path = '/content/dataset_cleaned_big_test.csv'
test1 = pd.read_csv(path)

In [34]:
test1.head()

Unnamed: 0,sim_number,sim_price_range,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi
0,707963979,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,3979
1,818999191,0,3,1,0,0,0,0,0,0,1,0,0,0,0,0,9191
2,585444777,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,4777
3,378569788,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9788
4,386495666,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,5666


In [35]:
test1['price_vnd'] = result

In [36]:
test1.head()

Unnamed: 0,sim_number,sim_price_range,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi,price_vnd
0,707963979,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,3979,10000000
1,818999191,0,3,1,0,0,0,0,0,0,1,0,0,0,0,0,9191,12000000
2,585444777,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,4777,10000000
3,378569788,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9788,10000000
4,386495666,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,5666,10000000


In [38]:
test1.to_csv('/content/big_test.csv')

In [30]:
test.head()

Unnamed: 0,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,pre
0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,10000000
1,3,1,0,0,0,0,0,0,1,0,0,0,0,0,12000000
2,4,1,0,0,0,0,0,0,0,0,0,0,0,0,10000000
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,10000000
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,10000000


In [209]:
y_test.head()

18969    11500000
23087    10500000
6887     12000000
27899    12000000
10980    50000000
Name: price_vnd, dtype: int64

In [196]:
path = '/content/dataset_cleaned_big.csv'
dff = pd.read_csv(path)


In [197]:
dff.head()

Unnamed: 0.1,Unnamed: 0,price_vnd,sim_number,sim_price_range,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi
0,0,12000000,926052005,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,2005
1,1,199000000,769889999,0,2,1,1,0,0,0,0,0,1,1,1,0,0,0,9999
2,2,104000000,786008888,0,2,1,1,0,0,0,0,0,1,1,1,0,0,0,8888
3,3,14000000,834141141,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,1141
4,4,11325000,856637678,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,7678


In [206]:
X_test.head()

Unnamed: 0,network,sim_tam_hoa,sim_tu_quy,sim_ngu_quy,sim_nam_sinh,sim_loc_phat,sim_so_tien,sim_than_tai,sim_lap,sim_dao,sim_lap_kep,sim_kep,sim_lap_ba,sim_so_lui,4_so_cuoi
18969,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1888
23087,3,1,0,0,0,0,0,0,0,0,0,0,0,0,4999
6887,2,0,0,0,0,0,0,0,0,0,0,0,0,0,9389
27899,1,1,0,0,0,0,0,0,0,0,0,0,0,0,8966
10980,3,1,1,0,0,0,0,0,1,1,1,0,0,0,6666
