### Import Libraries

In [1]:
import pandas as pd

### Read the CSV files

In [2]:
# Define file paths
DEMOS_PATH = '../../data/raw/demos.csv'
DLCS_PATH = '../../data/raw/dlcs.csv'
DEMO_OUTPUT_PATH = '../../data/processed/cleaned_demos.csv'
DLC_OUTPUT_PATH = '../../data/processed/cleaned_dlcs.csv'

demo_df = pd.read_csv(DEMOS_PATH)
dlc_df = pd.read_csv(DLCS_PATH)

print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())

Demo File Columns: ['Unnamed: 0', 'full_game_appid', 'demo_appid', 'name']
DLC File Columns: ['base_appid', 'dlc_appid', 'name']


### Statistics Before Preprocessing

In [3]:
demo_df.info()
dlc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15449 entries, 0 to 15448
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       15449 non-null  int64 
 1   full_game_appid  15449 non-null  object
 2   demo_appid       15449 non-null  object
 3   name             15448 non-null  object
dtypes: int64(1), object(3)
memory usage: 482.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5445 entries, 0 to 5444
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   base_appid  5445 non-null   object
 1   dlc_appid   5445 non-null   object
 2   name        5445 non-null   object
dtypes: object(3)
memory usage: 127.7+ KB


### Drop auto_incremented columns

In [4]:
demo_df.drop("Unnamed: 0", axis=1, inplace=True)

print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())

Demo File Columns: ['full_game_appid', 'demo_appid', 'name']
DLC File Columns: ['base_appid', 'dlc_appid', 'name']


### Standardize column names

In [5]:
demo_df.columns = demo_df.columns.str.lower()
dlc_df.columns = dlc_df.columns.str.lower()
demo_df.rename(columns={"full_game_appid": "appid"}, inplace=True)
dlc_df.rename(columns={"base_appid": "appid"}, inplace=True)

print("\nAfter Standardizing Column Names:\n")
print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())


After Standardizing Column Names:

Demo File Columns: ['appid', 'demo_appid', 'name']
DLC File Columns: ['appid', 'dlc_appid', 'name']


### Handle missing values

In [6]:
original_len_demo = len(demo_df)
original_len_dlc = len(dlc_df)
demo_df.dropna(inplace=True)
dlc_df.dropna(inplace=True)
demo_df.dropna(subset=["demo_appid", "appid"], inplace=True)
dlc_df.dropna(subset=["dlc_appid", "appid"], inplace=True)

print(f"Dropped {original_len_demo - len(demo_df)} rows in Demo DataFrame due to missing values.")
print(f"Remaining rows in Demo DataFrame: {len(demo_df)}\n")
print(f"Dropped {original_len_dlc - len(dlc_df)} rows in DLC DataFrame due to missing values.")
print(f"Remaining rows in DLC DataFrame: {len(dlc_df)}")

Dropped 1 rows in Demo DataFrame due to missing values.
Remaining rows in Demo DataFrame: 15448

Dropped 0 rows in DLC DataFrame due to missing values.
Remaining rows in DLC DataFrame: 5445


### Convert data types

In [7]:
original_len_demo = len(demo_df)
original_len_dlc = len(dlc_df)

for df, appid_col in [(demo_df, "demo_appid"), (dlc_df, "dlc_appid")]:
    # errors="coerce" means that if a value cannot be converted to a number it's replace by NaN to then be removed
    df[appid_col] = pd.to_numeric(df[appid_col], errors="coerce") 
    df["appid"] = pd.to_numeric(df["appid"], errors="coerce")

    # Drop rows where appid or demo/dlc_appid is missing/NaN
    df.dropna(subset=[appid_col, "appid"], inplace=True)

    df[appid_col] = df[appid_col].astype(int)
    df["appid"] = df["appid"].astype(int)
    df["name"] = df["name"].astype(str)

print(f"Dropped {original_len_demo - len(demo_df)} rows in Demo DataFrame due to missing values.")
print(f"Remaining rows in Demo DataFrame: {len(demo_df)}\n")
print(f"Dropped {original_len_dlc - len(dlc_df)} rows in DLC DataFrame due to missing values.")
print(f"Remaining rows in DLC DataFrame: {len(dlc_df)}")


Dropped 1 rows in Demo DataFrame due to missing values.
Remaining rows in Demo DataFrame: 15447

Dropped 1 rows in DLC DataFrame due to missing values.
Remaining rows in DLC DataFrame: 5444


### Remove duplicates by unique identifier

In [8]:
demo_duplicates = demo_df.duplicated(subset=["demo_appid"]).sum()
dlc_duplicates = dlc_df.duplicated(subset=["dlc_appid"]).sum()
demo_df.drop_duplicates(subset=["demo_appid"], inplace=True)
dlc_df.drop_duplicates(subset=["dlc_appid"], inplace=True)
print(f"Dropped {demo_duplicates} duplicate rows in Demo DataFrame based on demo_appid.")
print(f"Dropped {dlc_duplicates} duplicate rows in DLC DataFrame based on dlc_appid.")

Dropped 0 duplicate rows in Demo DataFrame based on demo_appid.
Dropped 0 duplicate rows in DLC DataFrame based on dlc_appid.


### Clean the 'name' column

In [9]:
demo_df["name"] = (demo_df["name"]
                   .str.replace(r"(?i)\bdemo\b", "", regex=True)
                   .str.replace(r"(?i)\bdlc\b", "", regex=True)
                   .str.replace(r"\s+", " ", regex=True)  # remove multiple spaces
                   .str.strip()  # remove leading/trailing spaces after replacements
                   .str.title())  # capitalize the first letter of each word

dlc_df["name"] = (dlc_df["name"]
                  .str.replace(r"(?i)\bdemo\b", "", regex=True)
                  .str.replace(r"(?i)\bdlc\b", "", regex=True)
                  .str.replace(r"\s+", " ", regex=True)
                  .str.strip()
                  .str.title()) 


### Reset Indicies

In [10]:
demo_df.reset_index(drop=True, inplace=True)
dlc_df.reset_index(drop=True, inplace=True)

### Save the preprocessed data

In [11]:
# demo_df.to_csv(DEMO_OUTPUT_PATH, index=False)
# dlc_df.to_csv(DLC_OUTPUT_PATH, index=False)

### Statistics After Preprocessing

In [12]:
demo_df.info()
dlc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15447 entries, 0 to 15446
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   appid       15447 non-null  int64 
 1   demo_appid  15447 non-null  int64 
 2   name        15447 non-null  object
dtypes: int64(2), object(1)
memory usage: 362.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5444 entries, 0 to 5443
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   appid      5444 non-null   int64 
 1   dlc_appid  5444 non-null   int64 
 2   name       5444 non-null   object
dtypes: int64(2), object(1)
memory usage: 127.7+ KB


### Summary

In [13]:
print("\nPreprocessing Summary:")
print(f"Demo DataFrame: {len(demo_df)} rows, {len(demo_df.columns)} columns")
print(f"DLC DataFrame: {len(dlc_df)} rows, {len(dlc_df.columns)} columns\n")

print(dlc_df.head())
print("\n")
print(demo_df.head())


Preprocessing Summary:
Demo DataFrame: 15447 rows, 3 columns
DLC DataFrame: 5444 rows, 3 columns

     appid  dlc_appid                                             name
0  1786750    2568660                        家出王女 - 全年齢版ストーリー&グラフィック追加
1  1981700    2563730                           Jacob'S Quest - Voyage
2  2009450    2552980  Invector: Rhythm Galaxy - Latin Power Song Pack
3  1133420    2550750         Hero Or Villain: Genesis — Supercharged!
4  2533950    2551000            Hot And Lovely ：Uniform - Adult Patch


     appid  demo_appid                    name
0  2214650     2573370          Rolando Deluxe
1  1439980     2573460  Outrunner: Neon Nights
2  2412240     2572840     Bubble Ghost Remake
3  2448830     2572240           Time Handlers
4  2379590     2570800    Hope'S Final Defense
