In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from cudf import DataFrame as cudf
import cupy as cp  # For GPU acceleration
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from xgboos

In [9]:
# Read the three competition CSVs from the current working directory.
submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

# The training frame is kept without its 'id' column; the test frame is left
# exactly as read.
train = pd.read_csv('train.csv').drop(columns=['id'])

# GPU-side copies of both frames. Note that `cudf` here is the cuDF DataFrame
# class itself (imported under that alias), not the cudf module.
test_gpu = cudf(test)
train_gpu = cudf(train)

In [10]:
# Print the training frame's schema: column names, non-null counts, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         750000 non-null  object 
 1   Age         750000 non-null  int64  
 2   Height      750000 non-null  float64
 3   Weight      750000 non-null  float64
 4   Duration    750000 non-null  float64
 5   Heart_Rate  750000 non-null  float64
 6   Body_Temp   750000 non-null  float64
 7   Calories    750000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 45.8+ MB


In [11]:
# Summary statistics for the numeric columns, transposed so each feature is a row.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,750000.0,41.420404,15.175049,20.0,28.0,40.0,52.0,79.0
Height,750000.0,174.697685,12.824496,126.0,164.0,174.0,185.0,222.0
Weight,750000.0,75.145668,13.982704,36.0,63.0,74.0,87.0,132.0
Duration,750000.0,15.421015,8.354095,1.0,8.0,15.0,23.0,30.0
Heart_Rate,750000.0,95.483995,9.449845,67.0,88.0,95.0,103.0,128.0
Body_Temp,750000.0,40.036253,0.779875,37.1,39.6,40.3,40.7,41.5
Calories,750000.0,88.282781,62.395349,1.0,34.0,77.0,136.0,314.0


In [12]:
# Count rows per 'Sex' category to check how balanced the two groups are.
train['Sex'].value_counts()

Sex
female    375721
male      374279
Name: count, dtype: int64

In [None]:
# Distribution of every numerical feature, split by sex: overlaid histograms
# (raw counts) plus a Gaussian KDE curve rescaled onto the same count axis.
numerical_features = train.select_dtypes(include=[np.number]).columns.tolist()

# Seeded generator so the KDE subsample, and therefore the rendered figure,
# is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# One stacked subplot per feature, each with its own x-axis
fig = make_subplots(
    rows=len(numerical_features), 
    cols=1,
    subplot_titles=[f"Distribution of {feature}" for feature in numerical_features],
    vertical_spacing=0.05,
    shared_xaxes=False
)

# Color per sex, shared by the bars and the KDE line
colors = {'male': 'blue', 'female': 'red'}

# The row filter by sex does not depend on the feature, so build it once here
# instead of once per (feature, sex) pair inside the loop.
frames_by_sex = {sex: train[train['Sex'] == sex] for sex in ['male', 'female']}

# For each numerical feature, create a histogram with KDE lines for each sex
for i, feature in enumerate(numerical_features):
    # Range of the feature over the whole training set (taken from the cuDF copy),
    # so both sexes share the same bins and the same KDE grid.
    min_val = float(train_gpu[feature].min())
    max_val = float(train_gpu[feature].max())
    nbins = 30
    bin_width = (max_val - min_val) / nbins
    
    # np.linspace pins the last edge to exactly max_val. Building the edges as
    # min_val + j * bin_width can land a hair below max_val through float rounding,
    # which would silently drop the maximum observations from the histogram.
    edges = np.linspace(min_val, max_val, nbins + 1)
    centers = (edges[:-1] + edges[1:]) / 2
    
    # Create histograms and KDE for each sex
    for sex in ['male', 'female']:
        # Feature values for this sex as a plain NumPy array
        feature_data = frames_by_sex[sex][feature].values
        
        # Prefer the GPU histogram; fall back to NumPy on any CuPy/CUDA error.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        try:
            feature_data_cp = cp.asarray(feature_data)
            hist_cp, _ = cp.histogram(feature_data_cp, bins=edges)
            hist = cp.asnumpy(hist_cp)  # Back to host memory for plotting
        except Exception:
            hist, _ = np.histogram(feature_data, bins=edges)
        
        # Add histogram trace
        fig.add_trace(
            go.Bar(
                x=centers,
                y=hist,
                name=sex,
                marker_color=colors[sex],
                opacity=0.5,  # Lower opacity to make the KDE line more visible
                showlegend=(i == 0),
                legendgroup=sex
            ),
            row=i+1, 
            col=1
        )
        
        # A KDE needs at least two observations
        if len(feature_data) > 1:
            # Fit the KDE on at most 50,000 points to keep evaluation fast
            if len(feature_data) > 50000:
                sample_indices = rng.choice(len(feature_data), 50000, replace=False)
                kde_data = feature_data[sample_indices]
            else:
                kde_data = feature_data
                
            # Evaluate the density on a fine grid over the feature's range
            kde_x = np.linspace(min_val, max_val, 1000)
            kde = stats.gaussian_kde(kde_data)
            # density * N * bin_width = expected count per bin, so the curve sits
            # on the same scale as the bars. N is the full group size, not the
            # subsample size.
            kde_y = kde(kde_x) * len(feature_data) * bin_width
            
            # Add KDE line trace
            fig.add_trace(
                go.Scatter(
                    x=kde_x,
                    y=kde_y,
                    mode='lines',
                    line=dict(color=colors[sex], width=3),
                    name=f"{sex} KDE",
                    showlegend=(i == 0),
                    legendgroup=sex
                ),
                row=i+1,
                col=1
            )
    
    # Axis titles for this feature's subplot
    fig.update_xaxes(title_text="Value", row=i+1, col=1)
    fig.update_yaxes(title_text="Count", row=i+1, col=1)

# Update global layout
fig.update_layout(
    height=300 * len(numerical_features),
    width=900,
    title_text="Distribution of Numerical Features by Sex with KDE Curves",
    barmode='overlay',
    legend_title_text="Sex",
)

# Show the figure
fig.show()

In [None]:
# Scatter plot of every numerical feature against Calories, split by sex,
# with a least-squares straight line fitted to the plotted points of each group.

# Define the features to plot (excluding Calories which will be the y-axis)
features_to_plot = [f for f in numerical_features if f != 'Calories']

# Seeded generator so the plotted subsample, and the trend line fitted to it,
# is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# Create a figure with subplots - one row per feature
fig = make_subplots(
    rows=len(features_to_plot), 
    cols=1,
    subplot_titles=[f"{feature} vs Calories" for feature in features_to_plot],
    vertical_spacing=0.05
)

# Color per sex, shared by the markers and the trend line
colors = {'male': 'blue', 'female': 'red'}

# The row filter by sex does not depend on the feature, so build it once here
# instead of once per (feature, sex) pair inside the loop.
frames_by_sex = {sex: train[train['Sex'] == sex] for sex in ['male', 'female']}

# For each feature, create a scatter plot with trend lines
for i, feature in enumerate(features_to_plot):
    for sex in ['male', 'female']:
        filtered = frames_by_sex[sex]
        
        x_data_full = filtered[feature]
        y_data_full = filtered['Calories']
        
        # Plot at most 10,000 points per group so the figure stays responsive
        sample_size = min(len(filtered), 10000)
        if len(filtered) > sample_size:
            # Draw positions without replacement and apply them to both columns
            # so every x stays paired with its own y
            sample_indices = rng.choice(len(filtered), sample_size, replace=False)
            x_data = x_data_full.iloc[sample_indices]
            y_data = y_data_full.iloc[sample_indices]
        else:
            # Small group: plot everything
            x_data = x_data_full
            y_data = y_data_full
        
        # Add scatter plot
        fig.add_trace(
            go.Scatter(
                x=x_data,
                y=y_data,
                mode='markers',
                marker=dict(
                    color=colors[sex],
                    opacity=0.5,
                    size=3
                ),
                name=sex,
                showlegend=(i == 0),  # Only show legend for the first feature
                legendgroup=sex
            ),
            row=i+1, 
            col=1
        )
        
        # Degree-1 polynomial fit on the plotted points. Hand the fitting
        # routines plain ndarrays rather than pandas Series.
        x_fit = x_data.to_numpy()
        y_fit = y_data.to_numpy()
        # Prefer the GPU fit; fall back to NumPy on any CuPy/CUDA error.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        try:
            z = cp.asnumpy(cp.polyfit(cp.asarray(x_fit), cp.asarray(y_fit), 1))
        except Exception:
            z = np.polyfit(x_fit, y_fit, 1)
        p = np.poly1d(z)
        
        # Evaluate the fitted line across the range of the plotted x values
        x_trend = np.linspace(float(x_data.min()), float(x_data.max()), 100)
        y_trend = p(x_trend)
        
        # Add trend line
        fig.add_trace(
            go.Scatter(
                x=x_trend,
                y=y_trend,
                mode='lines',
                line=dict(color=colors[sex], width=3),
                name=f"{sex} trend",
                showlegend=(i == 0),
                legendgroup=sex
            ),
            row=i+1,
            col=1
        )
    
    # Axis titles for this feature's subplot
    fig.update_xaxes(title_text=feature, row=i+1, col=1)
    fig.update_yaxes(title_text="Calories", row=i+1, col=1)

# Update global layout
fig.update_layout(
    height=300 * len(features_to_plot),
    width=900,
    title_text="Relationship between Numerical Features and Calories by Sex",
    legend_title_text="Sex"
)

# Show the figure
fig.show()