In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


In [2]:
df = pd.read_csv("Melbourne_housing_FULL.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [8]:
print(df.isna().sum())

Rooms                                       0
Price                                    7610
Distance                                    1
Postcode                                    1
Bedroom2                                 8217
                                         ... 
Regionname_Northern Victoria                0
Regionname_South-Eastern Metropolitan       0
Regionname_Southern Metropolitan            0
Regionname_Western Metropolitan             0
Regionname_Western Victoria                 0
Length: 34884, dtype: int64


In [9]:
df = df[df['Price'].notna()]


In [10]:
features = [
    'Rooms', 'Distance', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'BuildingArea', 'YearBuilt'
]

df = df[features + ['Price']]

In [11]:
df.fillna(df.median(numeric_only=True), inplace=True)


In [12]:
print(df.isna().sum())


Rooms           0
Distance        0
Bedroom2        0
Bathroom        0
Car             0
Landsize        0
BuildingArea    0
YearBuilt       0
Price           0
dtype: int64


In [13]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Price
1,2,2.5,2.0,1.0,1.0,202.0,133.0,1970.0,1480000.0
2,2,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1035000.0
4,3,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1465000.0
5,3,2.5,3.0,2.0,1.0,94.0,133.0,1970.0,850000.0
6,4,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1600000.0


In [14]:
X = df.drop("Price", axis=1)
y = df["Price"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

In [17]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [18]:
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))

RMSE: 403712.04407042125
R² Score: 0.618582462946471


In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


In [21]:
df = pd.read_csv("bank.csv")
df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [22]:
for col in df.select_dtypes(include="object"):
    df[col] = LabelEncoder().fit_transform(df[col])


In [24]:
print(df.columns)


Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')


In [25]:
X = df.drop("deposit", axis=1)
y = df["deposit"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [27]:
model = RandomForestClassifier(
    n_estimators=150,
    class_weight="balanced",
    random_state=42
)

In [28]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [29]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8504254366323332
              precision    recall  f1-score   support

           0       0.88      0.83      0.85      1175
           1       0.82      0.88      0.85      1058

    accuracy                           0.85      2233
   macro avg       0.85      0.85      0.85      2233
weighted avg       0.85      0.85      0.85      2233



In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


In [31]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [32]:
X = df.drop("price_range", axis=1)
y = df["price_range"]

In [33]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    random_state=42
)

In [35]:

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [36]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8975
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       105
           1       0.88      0.87      0.87        91
           2       0.82      0.87      0.85        92
           3       0.94      0.89      0.92       112

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90      0.90       400



In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


In [39]:
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [40]:
X = df.drop("DEATH_EVENT", axis=1)
y = df["DEATH_EVENT"]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [42]:
model = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42
)

In [43]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [44]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("OOB Score:", model.oob_score_)
print(classification_report(y_test, y_pred))

Accuracy: 0.75
OOB Score: 0.8828451882845189
              precision    recall  f1-score   support

           0       0.72      0.94      0.81        35
           1       0.86      0.48      0.62        25

    accuracy                           0.75        60
   macro avg       0.79      0.71      0.72        60
weighted avg       0.78      0.75      0.73        60

