### Import Libraries

In [244]:
import pandas as pd

### Read the CSV files

In [245]:
# Define file paths
DEMOS_PATH = '../../data/raw/demos.csv'
DLCS_PATH = '../../data/raw/dlcs.csv'
DEMO_OUTPUT_PATH = '../../data/processed/cleaned_demos.csv'
DLC_OUTPUT_PATH = '../../data/processed/cleaned_dlcs.csv'
BASE_GAMES_PATH = '../../data/raw/info_base_games.csv'
BASE_GAMES_OUTPUT_PATH = '../../data/processed/info_base_games_with_features.csv'

demo_df = pd.read_csv(DEMOS_PATH)
dlc_df = pd.read_csv(DLCS_PATH)
info_base_games_df = pd.read_csv(BASE_GAMES_PATH)

print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())

Demo File Columns: ['Unnamed: 0', 'full_game_appid', 'demo_appid', 'name']
DLC File Columns: ['base_appid', 'dlc_appid', 'name']


  info_base_games_df = pd.read_csv(BASE_GAMES_PATH)


### Statistics Before Preprocessing

In [246]:
demo_df.info()
dlc_df.info()
info_base_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15449 entries, 0 to 15448
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       15449 non-null  int64 
 1   full_game_appid  15449 non-null  object
 2   demo_appid       15449 non-null  object
 3   name             15448 non-null  object
dtypes: int64(1), object(3)
memory usage: 482.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5445 entries, 0 to 5444
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   base_appid  5445 non-null   object
 1   dlc_appid   5445 non-null   object
 2   name        5445 non-null   object
dtypes: object(3)
memory usage: 127.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99167 entries, 0 to 99166
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   appid     

### Drop auto_incremented columns

In [247]:
demo_df.drop("Unnamed: 0", axis=1, inplace=True)

print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())

Demo File Columns: ['full_game_appid', 'demo_appid', 'name']
DLC File Columns: ['base_appid', 'dlc_appid', 'name']


### Standardize column names

In [248]:
demo_df.rename(columns={"full_game_appid": "appid"}, inplace=True)
dlc_df.rename(columns={"base_appid": "appid"}, inplace=True)

print("\nAfter Standardizing Column Names:\n")
print("Demo File Columns:", demo_df.columns.tolist())
print("DLC File Columns:", dlc_df.columns.tolist())


After Standardizing Column Names:

Demo File Columns: ['appid', 'demo_appid', 'name']
DLC File Columns: ['appid', 'dlc_appid', 'name']


### Handle missing values

In [249]:
original_len_demo = len(demo_df)
original_len_dlc = len(dlc_df)
demo_df.dropna(inplace=True)
dlc_df.dropna(inplace=True)

print(f"Dropped {original_len_demo - len(demo_df)} rows in Demo DataFrame due to missing values.")
print(f"Remaining rows in Demo DataFrame: {len(demo_df)}\n")
print(f"Dropped {original_len_dlc - len(dlc_df)} rows in DLC DataFrame due to missing values.")
print(f"Remaining rows in DLC DataFrame: {len(dlc_df)}")

Dropped 1 rows in Demo DataFrame due to missing values.
Remaining rows in Demo DataFrame: 15448

Dropped 0 rows in DLC DataFrame due to missing values.
Remaining rows in DLC DataFrame: 5445


### Convert data types

- errors="coerce" means that if a value cannot be converted to a number **"id is not a number"** it's replaced by NaN
- It then gets removed by dropna() function

In [250]:
original_len_demo = len(demo_df)
original_len_dlc = len(dlc_df)

for df, appid_col in [(demo_df, "demo_appid"), (dlc_df, "dlc_appid")]:
    df[appid_col] = pd.to_numeric(df[appid_col], errors="coerce") 
    df["appid"] = pd.to_numeric(df["appid"], errors="coerce")

    # Drop rows where appid or demo/dlc_appid is missing/NaN
    df.dropna(subset=[appid_col, "appid"], inplace=True)

    df[appid_col] = df[appid_col].astype(int)
    df["appid"] = df["appid"].astype(int)
    df["name"] = df["name"].astype(str)

print(f"Dropped {original_len_demo - len(demo_df)} rows in Demo DataFrame due to missing values.")
print(f"Remaining rows in Demo DataFrame: {len(demo_df)}\n")
print(f"Dropped {original_len_dlc - len(dlc_df)} rows in DLC DataFrame due to missing values.")
print(f"Remaining rows in DLC DataFrame: {len(dlc_df)}")


Dropped 1 rows in Demo DataFrame due to missing values.
Remaining rows in Demo DataFrame: 15447

Dropped 1 rows in DLC DataFrame due to missing values.
Remaining rows in DLC DataFrame: 5444


### Remove duplicates by unique identifier

- **demo_appid**: In the DEMOs file
- **dlc_appid**: In the DLCs file

In [251]:
demo_duplicates = demo_df.duplicated(subset=["demo_appid"]).sum()
dlc_duplicates = dlc_df.duplicated(subset=["dlc_appid"]).sum()
demo_df.drop_duplicates(subset=["demo_appid"], inplace=True)
dlc_df.drop_duplicates(subset=["dlc_appid"], inplace=True)
print(f"Dropped {demo_duplicates} duplicate rows in Demo DataFrame based on demo_appid.")
print(f"Dropped {dlc_duplicates} duplicate rows in DLC DataFrame based on dlc_appid.")

Dropped 0 duplicate rows in Demo DataFrame based on demo_appid.
Dropped 0 duplicate rows in DLC DataFrame based on dlc_appid.


### Clean the 'name' column

- By removing any leading or trailing whitespace from the 'name' column.
- By removing any non-alphanumeric characters from the 'name' column.
- By removing 'Demo' and 'DLC' from the 'name' column.

In [252]:
def clean_name_column(df, column_name):
    df[column_name] = (df[column_name]
                      .str.replace(r"(?i)\bdemo\b", "", regex=True)
                      .str.replace(r"(?i)\bdlc\b", "", regex=True)
                      .str.replace(r"\s+", " ", regex=True)
                      .str.strip()
                      .str.title())
    return df

demo_df = clean_name_column(demo_df, "name")
dlc_df = clean_name_column(dlc_df, "name")

### Reset Indicies

- Reset the indices of the dataframe to the original order.

In [253]:
demo_df.reset_index(drop=True, inplace=True)
dlc_df.reset_index(drop=True, inplace=True)

### Create indicator and count features

- Merged later with info_base_games dataframe

In [254]:
def create_indicator_features(df, group_by_col, count_col, feature_prefix):

    # group by appid to count the demo/dlc count for each game 
    indicators = df.groupby(group_by_col)[count_col].count().reset_index()
    
    # create has_feature column
    indicators[f'has_{feature_prefix}'] = 1

    # create count column
    indicators[f'{feature_prefix}_count'] = indicators[count_col]
    
    # e.g.: return demo_df[['appid', 'has_demo', 'demo_count']]  
    return indicators[[group_by_col, f'has_{feature_prefix}', f'{feature_prefix}_count']]

demo_indicators = create_indicator_features(demo_df, 'appid', 'demo_appid', 'demo')
dlc_indicators = create_indicator_features(dlc_df, 'appid', 'dlc_appid', 'dlc')

### Merge features with base_games_df

In [255]:
info_base_games_df = info_base_games_df.merge(demo_indicators, on="appid", how="left")
info_base_games_df = info_base_games_df.merge(dlc_indicators, on="appid", how="left")

### Fill NaN values in the new features:

- 0 for indicators and counts where no demo/DLC exists


In [256]:
info_base_games_df["has_demo"] = info_base_games_df["has_demo"].fillna(0).astype(int)
info_base_games_df["demo_count"] = info_base_games_df["demo_count"].fillna(0).astype(int)
info_base_games_df["has_dlc"] = info_base_games_df["has_dlc"].fillna(0).astype(int)
info_base_games_df["dlc_count"] = info_base_games_df["dlc_count"].fillna(0).astype(int)

### Save the preprocessed data

- Used to save the preprocessed data to a file for testing and validation purposes.
- It's also used to add 4 features to the info_base_games: 
    
    - dlc count
    - demo count
    - dlc indicator
    - demo indicator
    

In [257]:
# demo_df.to_csv(DEMO_OUTPUT_PATH, index=False)
# dlc_df.to_csv(DLC_OUTPUT_PATH, index=False)
# info_base_games_df.to_csv(BASE_GAMES_OUTPUT_PATH, index=False)

### Statistics After Preprocessing

In [258]:
demo_df.info()
dlc_df.info()
info_base_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15447 entries, 0 to 15446
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   appid       15447 non-null  int64 
 1   demo_appid  15447 non-null  int64 
 2   name        15447 non-null  object
dtypes: int64(2), object(1)
memory usage: 362.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5444 entries, 0 to 5443
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   appid      5444 non-null   int64 
 1   dlc_appid  5444 non-null   int64 
 2   name       5444 non-null   object
dtypes: int64(2), object(1)
memory usage: 127.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99167 entries, 0 to 99166
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   appid                99167 non-null  object
 1   name                 99149

### Summary

In [259]:
print("\nPreprocessing Summary:\n")
print(f"Demo DataFrame: {len(demo_df)} rows, {len(demo_df.columns)} columns")
print(f"DLC DataFrame: {len(dlc_df)} rows, {len(dlc_df.columns)} columns\n")
print(f"Info_base_games DataFrame: {len(info_base_games_df)} rows, {len(info_base_games_df.columns)} columns\n")

print(dlc_df.head())
print("\n")
print(demo_df.head())
print("\n")
print(info_base_games_df.head())


Preprocessing Summary:

Demo DataFrame: 15447 rows, 3 columns
DLC DataFrame: 5444 rows, 3 columns

Info_base_games DataFrame: 99167 rows, 14 columns

     appid  dlc_appid                                             name
0  1786750    2568660                        家出王女 - 全年齢版ストーリー&グラフィック追加
1  1981700    2563730                           Jacob'S Quest - Voyage
2  2009450    2552980  Invector: Rhythm Galaxy - Latin Power Song Pack
3  1133420    2550750         Hero Or Villain: Genesis — Supercharged!
4  2533950    2551000            Hot And Lovely ：Uniform - Adult Patch


     appid  demo_appid                    name
0  2214650     2573370          Rolando Deluxe
1  1439980     2573460  Outrunner: Neon Nights
2  2412240     2572840     Bubble Ghost Remake
3  2448830     2572240           Time Handlers
4  2379590     2570800    Hope'S Final Defense


     appid                      name metacritic  steam_achievements  \
0  2574000            Femboy Burgers        NaN                Tru

### Conclusion

The 2 csv files contain the following :

##### 1. Demos

- contains the demo name, demo id and the main game id

##### 2. Dlcs

- contains the dlc name, dlc id and the main game id

#### Some Notes

- I didn't scale or normalize the values of the ids in both csv files because they are identifiers.
- I normalized all the column names, removed missing value and converted the data types of the columns to the appropriate ones.
- I removed the duplicate rows from both csv files based on the 'id' column (e.g., "demo_appid").
- I normalized the 'name' column in both csv files by removing the leading and trailing whitespaces.
- I also removed the 'Demo' and 'DLC' from the 'name' column in both csv files.
- I added the code for adding the 4 features requested.
- Any comments are for either for testing my part or for creating and merging the new 4 features used in the info_base_games csv

#### What I think About the Data

- I think the 2 csv files' columns won't be useful or have any indication of the game's popularity or how many copies it sold; they are more like metadata.