In [334]:
#firstly, we import pandas in order to work with the data set
import pandas as pd
import numpy as np

#First step from creating the model is to import the important library for machine learning, which in our situation is scikit-learn.
import sklearn as scikit_learn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [335]:
#secondly, we have to configure the file path in order to load the data f
# Load the dataset
file_path = "Data/set8_it1.csv"  # Update with your file path
df = pd.read_csv(file_path)

In [336]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   DI              2363 non-null   int64  
 1   TradeValue      2363 non-null   float64
 2   RealmType       2363 non-null   object 
 3   Enclave         2363 non-null   object 
 4   LivingQuarters  2244 non-null   float64
 5   ParcelSize      2363 non-null   float64
 6   ParcelSizeUnit  2363 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 129.4+ KB


In [337]:
# 1. Remove negative values in ParcelSize
df = df[df["ParcelSize"] >= 10]

In [338]:
# 2. Remove rows with missing LivingQuarters data
df["LivingQuarters"] = df["LivingQuarters"].fillna(df["LivingQuarters"].median())

In [339]:
# 3. Remove rows where ParcelSize is less than 1
#df = df[df["ParcelSize"] >= 1]

In [340]:
# 4. Remove duplicate rows
df = df.drop_duplicates()

In [341]:
# 5. Standardize Enclave column (lowercase and strip spaces)
df["Enclave"] = df["Enclave"].str.strip().str.lower()
# Define a mapping to correct similar enclave names
enclave_corrections = {
    "shadowmeres": "shadowmere",
    "mystvales": "mystvale",
    "dragonspires": "dragonspire",
    "stormwatchs": "stormwatch",
    "thornfields": "thornfield",
    "petalbridges": "petalbridge",
    "silverglens": "silverglen",
    "riverbends": "riverbend",
    "emberfalls": "emberfall"
}

# Replace incorrect names with correct ones
df["Enclave"] = df["Enclave"].replace(enclave_corrections)

# Verify the results
df["Enclave"].unique()

array(['shadowmere', 'ravenstone', 'petalbridge', 'emberfall',
       'dragonspire', 'riverbend', 'cloudrest', 'sunspire', 'nighthaven',
       'brightforge', 'wyverncliff', 'crystalhollow', 'wolfsbane',
       'duskwood', 'mystvale', 'glimmerwood', 'stormhold', 'starfall',
       'silverglen', 'thornfield', 'dreamweaver', 'stormwatch',
       'windwhisper', 'oakenshade', 'mossheart', 'verdantia', 'moonglade'],
      dtype=object)

In [342]:
# 6. Convert ParcelSize to sqft if it is in sqm
conversion_factor = 10.7639  # 1 sqm = 10.7639 sqft
df.loc[df["ParcelSizeUnit"] == "sqm", "ParcelSize"] *= conversion_factor
df.loc[df["ParcelSizeUnit"] == "sqm", "ParcelSizeUnit"] = "sqft"  # Standardizing to sqft

In [343]:
# 7. Investigate TradeValue minimum
min_trade_value = df["TradeValue"].min()
outlier_trade_value = df[df["TradeValue"] == min_trade_value]

In [344]:
# 8. Detect outliers using 1.5*IQR method
def detect_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) & (df[column] > upper_bound)]

In [345]:
outliers_trade_value = detect_outliers("TradeValue")
outliers_living_quarters = detect_outliers("LivingQuarters")
outliers_parcel_size = detect_outliers("ParcelSize")

# Get index values of all outliers
outlier_indices = (
    outliers_trade_value.index.to_list() + 
    outliers_living_quarters.index.to_list() + 
    outliers_parcel_size.index.to_list()
)

# Drop these rows from the dataset
df = df.drop(index=outlier_indices)

In [346]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2318 entries, 0 to 2362
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   DI              2318 non-null   int64  
 1   TradeValue      2318 non-null   float64
 2   RealmType       2318 non-null   object 
 3   Enclave         2318 non-null   object 
 4   LivingQuarters  2318 non-null   float64
 5   ParcelSize      2318 non-null   float64
 6   ParcelSizeUnit  2318 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 144.9+ KB


In [347]:
df

Unnamed: 0,DI,TradeValue,RealmType,Enclave,LivingQuarters,ParcelSize,ParcelSizeUnit
0,0,125000.0,Meadowlands,shadowmere,1032.0,9280.0,sqft
1,1,250000.0,Meadowlands,ravenstone,1933.0,11170.0,sqft
2,2,130000.0,GnomeBurough,petalbridge,1012.0,5500.0,sqft
3,3,157500.0,Meadowlands,emberfall,843.0,13014.0,sqft
4,4,156000.0,Meadowlands,dragonspire,807.0,8092.0,sqft
...,...,...,...,...,...,...,...
2358,2335,135000.0,Meadowlands,dragonspire,1110.5,8064.0,sqft
2359,2336,334000.0,Meadowlands,glimmerwood,2173.0,11025.0,sqft
2360,2337,214000.0,Meadowlands,dragonspire,1138.0,8197.0,sqft
2361,2338,129500.0,Meadowlands,emberfall,759.0,9100.0,sqft


In [348]:
# The next step is to connect the prepared data from above and to use it for getting the mean and median of the TradeValue variable. 
mean_trade_value = df['TradeValue'].mean()
median_trade_value = df['TradeValue'].median()

In [349]:
#After we have connected the data, we are transforming the categorical data into numerical using the label encoder so the model can fit all of that information so it can make predictions
label_encoder_realm = LabelEncoder()
label_encoder_enclave = LabelEncoder()
df['RealmType'] = label_encoder_realm.fit_transform(df['RealmType'])
df['Enclave'] = label_encoder_enclave.fit_transform(df['Enclave'])

In [350]:
#After that we separate our data into two categories: categorical data, which we transform and data we want to predict. They are separated in X, y respectively
X = df.drop(['TradeValue', 'ParcelSizeUnit'], axis=1)
y = df['TradeValue']

In [351]:
#Then we assign the data, where we are using 80% of the data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   


In [352]:
#The final step is we have created the model, fit in the data and trained it to make predictions
value = y_train.median()
pred_train = [value] * len(y_train)
pred_test = [value] * len(y_test)

In [353]:
#After the predictions we are using metrics such as Mean Absolute Error and Mean squared error to see what the efficiency of our model is
mae = mean_absolute_error(y_test, pred_test) 
mse = mean_squared_error(y_test, pred_test)

In [354]:
mae

55169.150862068964

In [355]:
mean_absolute_percentage_error(y_test, pred_test)

0.2990899126672557

In [356]:
mse

6820405519.047414

In [357]:
df2 = df.copy()
df2 = df2.drop(columns=["ParcelSizeUnit"])
df2.corr()

Unnamed: 0,DI,TradeValue,RealmType,Enclave,LivingQuarters,ParcelSize
DI,1.0,-0.013513,0.023966,0.003944,-0.038318,-0.003256
TradeValue,-0.013513,1.0,0.070523,0.183309,0.688021,0.280705
RealmType,0.023966,0.070523,1.0,-0.39736,0.059819,0.201729
Enclave,0.003944,0.183309,-0.39736,1.0,0.100299,-0.028083
LivingQuarters,-0.038318,0.688021,0.059819,0.100299,1.0,0.316218
ParcelSize,-0.003256,0.280705,0.201729,-0.028083,0.316218,1.0


In [358]:
df["Enclave"].unique()

array([16, 14, 13,  6,  3, 15,  1, 21, 11,  0, 26,  2, 25,  5, 10,  7, 19,
       18, 17, 22,  4, 20, 24, 12,  9, 23,  8])