<a href="https://colab.research.google.com/github/qxygxt/Thermal-Prediction/blob/main/Thermal_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files under MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import xgboost

In [None]:
# Locations of the train / test CSV files on the mounted Drive.
train_data = "/content/drive/MyDrive/ThermalPrediction-Data/trainData.csv"
test_data = "/content/drive/MyDrive/ThermalPrediction-Data/testData.csv"

# Read both splits into DataFrames.
train_df = pd.read_csv(train_data)
test_df = pd.read_csv(test_data)

# Display (rows, columns) of each frame as the cell output.
train_df.shape, test_df.shape

((1000, 53), (21517, 53))

In [None]:
# List the column names of the raw training frame.
train_df.columns

In [None]:
# Per-column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

In [None]:
# Frequency of each raw season label before encoding.
train_df['Season'].value_counts()

Summer    394
Winter    341
Autumn    145
Spring    120
Name: Season, dtype: int64

In [None]:
# Numeric code assigned to each of the four season labels.
_SEASON_CODES = {"Spring": 1.0, "Summer": 2.0, "Autumn": 3.0, "Winter": 4.0}


def season_process(season):
  """Convert a season value to its numeric code.

  Args:
    season: "Spring", "Summer", "Autumn" or "Winter", or any value that
      `float()` accepts (for example a season that is already encoded).

  Returns:
    1.0, 2.0, 3.0 or 4.0 for the four named seasons; `float(season)` otherwise.

  Raises:
    ValueError, TypeError: propagated from `float()` when `season` is neither
      a known label nor convertible to a float.
  """
  if isinstance(season, str) and season in _SEASON_CODES:
    return _SEASON_CODES[season]
  # Already-numeric values pass through, which keeps a second run harmless.
  return float(season)


def _preference_label(value, default):
  """Return a 'Thermal preference' cell as an int, or `default` when it is empty.

  NaN is truthy in Python, so a bare `if value` check lets missing cells reach
  `int(float(nan))`, which raises ValueError. `pd.isna` is tested first so that
  NaN / None / pd.NA receive the default instead of crashing. Falsy cells
  (0, "") also receive the default.
  """
  if pd.isna(value) or not value:
    return default
  return int(float(value))


def feature_process(df, default_preference=1):
  """Clean and numerically encode a thermal-comfort frame in place.

  Steps, in order:
    1. Drop the "City" column when it exists.
    2. Encode "Season" through `season_process`.
    3. Truncate "Thermal preference" to whole numbers, substituting
       `default_preference` for missing or falsy cells.
    4. Cast every column except "Season" to float. This includes
       "Thermal preference", so the label column ends up as floats holding
       whole-number values.

  Missing feature cells (NaN) are left as NaN by step 4 because NaN is truthy;
  only falsy cells such as "" become 0.0.

  Args:
    df: DataFrame containing at least the "Season" and "Thermal preference"
      columns. It is modified in place.
    default_preference: Label used when a "Thermal preference" cell is
      missing or falsy. Defaults to 1.

  Returns:
    None. The caller's frame is mutated.

  Raises:
    KeyError: if "Season" or "Thermal preference" is absent.
    ValueError, TypeError: if a cell cannot be converted to a number.
  """
  # errors="ignore" makes the drop a no-op when "City" is already gone.
  df.drop(columns=["City"], inplace=True, errors="ignore")

  df['Season'] = df['Season'].apply(season_process)

  df['Thermal preference'] = df['Thermal preference'].apply(
      lambda cell: _preference_label(cell, default_preference))

  for feature in df.columns:
    if feature == "Season":
      continue
    # NaN is truthy, so it goes through float() unchanged and stays NaN.
    df[feature] = df[feature].apply(lambda cell: float(cell) if cell else 0.0)

In [None]:
# Encode the training frame; feature_process modifies its argument in place.
feature_process(train_df)

In [None]:
# Apply the same in-place encoding to the test frame.
feature_process(test_df)

In [None]:
# Check that the season labels were replaced by their numeric codes.
train_df['Season'].value_counts()

2.0    394
4.0    341
3.0    145
1.0    120
Name: Season, dtype: int64

In [None]:
# Class balance of the target column after processing.
train_df['Thermal preference'].value_counts()

2.0    396
1.0    364
3.0    240
Name: Thermal preference, dtype: int64

In [None]:
# Dtypes and non-null counts of the training frame after processing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 52 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Year                                              1000 non-null   float64
 1   Season                                            1000 non-null   float64
 2   Koppen climate classification                     1000 non-null   float64
 3   Climate                                           1000 non-null   float64
 4   Country                                           1000 non-null   float64
 5   Building type                                     1000 non-null   float64
 6   Cooling startegy_building level                   1000 non-null   float64
 7   Cooling startegy_operation mode for MM buildings  1000 non-null   float64
 8   Heating strategy_building level                   1000 non-null   float64
 9   Age                 

In [None]:
# Dtypes and non-null counts of the test frame after processing.
test_df.info()

In [None]:
# Name of the label column that is separated from the predictors.
target_col = 'Thermal preference'

# Predictors are every column except the label; the label is kept as its own Series.
x, y = train_df.drop(columns=[target_col]), train_df[target_col]
x_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

# Display the shapes of all four objects as the cell output.
x.shape, y.shape, x_test.shape, y_test.shape

((1000, 51), (1000,), (21517, 51), (21517,))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> index mapping from the training labels only.
le = LabelEncoder()
y = le.fit_transform(y)
# Reuse the fitted mapping for the test labels. Refitting on y_test could
# assign different indices if the two label sets differ; transform() instead
# raises on a label that was never seen during fitting.
y_test = le.transform(y_test)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data for validation; random_state is fixed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting splits as the cell output.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((800, 51), (200, 51), (800,), (200,))

## **Train**

In [None]:
# Passed as `rounds` to the EarlyStopping callback built below.
early_stopping_rounds = 100
# Passed as `metric_name` to the EarlyStopping callback built below.
metric_name = "mlogloss"

In [None]:
# Three-class XGBoost classifier: up to 600 trees of depth 2, with row and
# column subsampling of 0.8 each.
# NOTE(review): `nthread` and `seed` look like the legacy spellings; the current
# sklearn-style wrapper documents `n_jobs` and `random_state` — confirm against
# the installed xgboost version before renaming.
model = xgboost.XGBClassifier(
            learning_rate=0.1,
            n_estimators=600,
            max_depth=2,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softmax",
            num_class=3,
            nthread=4,
            seed=27)

In [None]:
# Callback list handed to model.fit: early stopping driven by `metric_name`
# with a patience of `early_stopping_rounds`.
# NOTE(review): save_best=True presumably makes the fitted model keep the best
# iteration rather than the last one — confirm in the xgboost callback docs.
es = [
    xgboost.callback.EarlyStopping(
      metric_name=metric_name,
      rounds=early_stopping_rounds,
      save_best=True
    )
]

In [None]:
# Train on the training split and evaluate on the validation split, with the early-stopping callbacks attached.
# NOTE(review): newer xgboost releases appear to expect `callbacks` on the estimator
# constructor rather than in fit() — confirm against the installed version.
model.fit(x_train, y_train, eval_set=[(x_val, y_val), ], callbacks=es)

[0]	validation_0-mlogloss:1.07154
[1]	validation_0-mlogloss:1.04842
[2]	validation_0-mlogloss:1.02357
[3]	validation_0-mlogloss:1.00547
[4]	validation_0-mlogloss:0.98311
[5]	validation_0-mlogloss:0.95969
[6]	validation_0-mlogloss:0.94896
[7]	validation_0-mlogloss:0.93303
[8]	validation_0-mlogloss:0.91751




[9]	validation_0-mlogloss:0.90698
[10]	validation_0-mlogloss:0.89930
[11]	validation_0-mlogloss:0.88877
[12]	validation_0-mlogloss:0.87917
[13]	validation_0-mlogloss:0.86778
[14]	validation_0-mlogloss:0.86043
[15]	validation_0-mlogloss:0.84830
[16]	validation_0-mlogloss:0.84132
[17]	validation_0-mlogloss:0.83301
[18]	validation_0-mlogloss:0.82789
[19]	validation_0-mlogloss:0.82179
[20]	validation_0-mlogloss:0.81867
[21]	validation_0-mlogloss:0.81284
[22]	validation_0-mlogloss:0.80741
[23]	validation_0-mlogloss:0.80313
[24]	validation_0-mlogloss:0.79772
[25]	validation_0-mlogloss:0.79354
[26]	validation_0-mlogloss:0.78777
[27]	validation_0-mlogloss:0.78122
[28]	validation_0-mlogloss:0.77705
[29]	validation_0-mlogloss:0.77379
[30]	validation_0-mlogloss:0.77147
[31]	validation_0-mlogloss:0.76766
[32]	validation_0-mlogloss:0.76402
[33]	validation_0-mlogloss:0.75813
[34]	validation_0-mlogloss:0.75472
[35]	validation_0-mlogloss:0.75145
[36]	validation_0-mlogloss:0.74609
[37]	validation_0-mlo

In [None]:
from sklearn.metrics import classification_report

# Predict the test frame, then print per-class precision / recall / F1.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.75      0.78      7517
           1       0.73      0.76      0.74      8708
           2       0.71      0.74      0.73      5292

    accuracy                           0.75     21517
   macro avg       0.75      0.75      0.75     21517
weighted avg       0.76      0.75      0.75     21517



In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Keyword arguments shared by the three macro-averaged scores.
macro = {"average": "macro"}

accuracy = accuracy_score(y_test, y_pred)
rc = recall_score(y_test, y_pred, **macro)
pr = precision_score(y_test, y_pred, **macro)
f_1 = f1_score(y_test, y_pred, **macro)

# Print the four summary numbers, one per line, in a fixed order.
for label, value in (("Accuracy:", accuracy), ("Recall:", rc), ("pr:", pr), ("F1:", f_1)):
    print(label, value)

Accuracy: 0.7531719105823302
Recall: 0.7517968560025693
pr: 0.7527343269001329
F1: 0.7515998351346332
