EDA done by Apurva Umredkar (50592382) using a new dataset

How does online gaming affect the mental health of a person?
The dataset for this problem was downloaded from Kaggle: https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data

In [None]:
import requests
import zipfile
import io
import os
import pandas as pd

url = 'https://storage.googleapis.com/kaggle-data-sets/820200/1403222/compressed/GamingStudy_data.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241008%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241008T143813Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1598505283cac2bfb3360235fe96f97ff4c110c3b2a70d1e04aa5c96f5d6ba3b7270ee85e73d7901c33feb8b3f872e9e212f9076f180a089c899c32cb879677885c9e55f22e2ea3f30a0dcd288ba9d759ef254388b2d752888679809085bb057c1f152ba976260333205e16131c02ab715ce1a2cc9b0fa06cf06206ed967fae11fdefbe37c30a9574fad339b2a83213ac9ef1400bd17a2415884d71c577e03afdf821478cfd03449d8fbce3779f83a8b323adf448995e59c37d8704a9327ad8614074685b89c149ca6cda7d4cd7c7c31fec916383d659745ac7f88f5786ed28bbbf2f1e0fd4e405765906c40239659f8aae9b044339ceed46aece7376b990210'

# Download the file
response = requests.get(url)

# Check if the download was successful
if response.status_code == 200:
    # Create a file-like object from the response content
    zip_file = zipfile.ZipFile(io.BytesIO(response.content))
    
    # Iterate over the files in the zip archive
    for file_name in zip_file.namelist():
        with zip_file.open(file_name) as extracted_file:
            # Read the content of the extracted file
            gaming_dat = pd.read_csv(extracted_file, encoding='ISO-8859-1')
            
    print('File downloaded and extracted successfully!')
else:
    print('Using the file stored locally in the folder named data')
    gaming_dat = pd.read_csv(r"./data/gamedata.csv", encoding='ISO-8859-1')

gaming_dat.head()

In [None]:
# dataset size
gaming_dat.shape, gaming_dat.columns

In [None]:
# data cleaning 1: s.no and timestamp are irrelevant, dropping these columns
gaming_dat = gaming_dat.drop(columns = ["S. No.", "Timestamp"], axis = 1)
gaming_dat.head()

In [None]:
# EDA 1: Count of ages
import matplotlib.pyplot as plt
import numpy as np
age_count = gaming_dat['Age'].value_counts().reset_index()
plt.bar(age_count["Age"], age_count["count"])
plt.title("Studying the age of gamers")
plt.xlabel("Age")
plt.ylabel("# of gamers")
plt.show()

mean_age = np.mean(gaming_dat["Age"])
print(f"Mean age of the gamers: {np.round(mean_age,0):.0f}")

EDA 1: Gamers from which country are more anxious?

In [None]:
import pandas as pd

# Step 1: Calculate total GAD score per participant
gaming_dat['GAD_total'] = gaming_dat[['GAD1', 'GAD2', 'GAD3', 'GAD4', 'GAD5', 'GAD6', 'GAD7']].sum(axis=1)

# Step 2: Group by country and calculate average GAD score
country_anxiety = gaming_dat.groupby('Residence_ISO3')['GAD_total'].mean()

# Step 3: Sort countries by average GAD score
country_anxiety_sorted = country_anxiety.sort_values(ascending=False).reset_index()

print(f"Based on online gaming data\nCountry with highest average anxiety: {country_anxiety_sorted.loc[0].Residence_ISO3} - {country_anxiety_sorted.loc[0].GAD_total}")
print(f"Country with lowest average anxiety: {country_anxiety_sorted.iloc[-1].Residence_ISO3} - {country_anxiety_sorted.iloc[-1].GAD_total} ")

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# from the dataset source, we have the following descriptsion for the GAD columns
# Statements mapped to their corresponding column names
statements = {
    'GAD1': 'Feeling nervous, anxious, or on edge',
    'GAD2': 'Not being able to stop or control worrying',
    'GAD3': 'Worrying too much about different things',
    'GAD4': 'Trouble relaxing',
    'GAD5': 'Being so restless that it`s hard to sit still',
    'GAD6': 'Becoming easily annoyed or irritable',
    'GAD7': 'Feeling afraid as if something awful might happen'
}

statement_columns = list(statements.keys())
statement_labels = list(statements.values())

In [None]:
gaming_dat_copy = gaming_dat.copy()

# Creating bins for age
age_bins = [18, 20, 25, 30, 100]
age_labels = ['18-20', '20-25', '25-30', '30+']
gaming_dat_copy['AgeBin'] = pd.cut(gaming_dat_copy.Age, age_bins, labels=age_labels)

# Creating bins for hours played per week
hour_bins = [0, 20, 40, 60, 80, 120]
hour_labels = ['0-20', '20-40', '40-60', '60-80', '80+']
gaming_dat_copy['HoursBin'] = pd.cut(gaming_dat_copy.Hours, hour_bins, labels=hour_labels)

# Prepare data for radar charts
def prepare_radar_data(df):
    radar_data = df.groupby(['AgeBin', 'HoursBin'])[statement_columns].mean().reset_index()
    return radar_data

radar_data = prepare_radar_data(gaming_dat_copy)

In [None]:
# Create subplots for each age zone
fig = make_subplots(rows=len(age_labels), cols=1,
                    subplot_titles=age_labels,
                    specs=[[{'type': 'polar'}] for _ in age_labels])

# Iterate over each age zone and add radar plot
for idx, age_zone in enumerate(age_labels):
    age_data = radar_data[radar_data['AgeBin'] == age_zone]
    for _, row in age_data.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row[statement_columns].values,
            theta=statement_labels,
            fill='toself',
            name=f"{row['AgeBin']} | {row['HoursBin']}"
        ), row=idx+1, col=1)

fig.update_layout(
    polar=dict(
        radialaxis=dict(visible=True, showticklabels=False)
    ),
    showlegend=True,
    title="Radar Chart for Psychological Well-being by Age and Hours Played",
    height=2000,  # Set the height of the entire figure
    width=1000    # Set the width of the entire figure
)
fig.show( )

Conclusion: From the radar charts, it can be seen that young gamers from the age group 18-25 suffer from more anxiety.