In [None]:
# Univariate Stats
def unistats(df):
    import pandas as pd

    output_df = pd.DataFrame(columns=['Count', 'Unique', 'Type', 'Min', 'Max', '25%', '50%', '75%', 'Mean', 'Median', 'Mode', 'Std', 'Skew', 'Kurt'])

    for col in df.columns:
        # these are the outputs that apply to every variable regardless of data type
        count = df[col].count()
        unique = df[col].nunique()
        dtype = str(df[col].dtype)
    
    if pd.api.types.is_numeric_dtype(dtype):
        # perform additional calculations for numeric variables
        min = round(df[col].min(), 2)
        max = round(df[col].max(), 2)
        quar_1 = round(df[col].quantile(.25), 2)
        quar_2 = round(df[col].quantile(.50), 2)
        quar_3 = round(df[col].quantile(.75), 2)
        mean = round(df[col].mean(), 2)
        median = round(df[col].median(), 2)
        mode = round(df[col].mode().values[0], 2)  # Use the .values[0] to prevent the return of an extra printed datatype
        std = round(df[col].std(), 2) 
        skew = round(df[col].skew(), 2)
        kurt = round(df[col].kurt(), 2)

        output_df.loc[col] = (count, unique, dtype, min, max, quar_1, quar_2, quar_3, mean, median, mode, std, skew, kurt)
    else:
        output_df.loc[col] = (count, unique, dtype, '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-')
    return output_df

In [None]:
# Binning Categorical features
def bin_categories(df, features=[], cutoff=0.05, replace_with='Other', messages=True):
        import pandas as pd  # Import pandas for DataFrame operations
    
        # Loop through each specified feature
        for feat in features:
          if feat in df.columns:  # Check if the feature exists in the DataFrame
            if not pd.api.types.is_numeric_dtype(df[feat]):  # Ensure the feature is categorical
              # Identify categories that appear in less than 'cutoff' proportion of the dataset
              other_list = df[feat].value_counts()[df[feat].value_counts() / df.shape[0] < cutoff].index
    
              # Replace rare categories with the specified label (default: "Other")
              df.loc[df[feat].isin(other_list), feat] = replace_with
          else:
            if messages: 
              print(f'{feat} not found in the DataFrame provided. No binning performed')  # Warn if feature is missing
    
        return df  # Return the modified DataFrame

In [None]:
def missing_drop(df, label="", features=[], messages=True, row_threshold=.9, col_threshold=.5):
    import pandas as pd
    
    start_count = df.count().sum()  # Store the initial count of non-null values
    
    # Drop columns with missing values beyond the specified column threshold
    df.dropna(axis=1, thresh=round(col_threshold * df.shape[0]), inplace=True)
    # Drop rows that have fewer non-null values than the row threshold allows
    df.dropna(axis=0, thresh=round(row_threshold * df.shape[1]), inplace=True)
    # If a label is specified, ensure it has no missing values
    if label != "": 
        df.dropna(axis=0, subset=[label], inplace=True)
    
    # Function to generate a summary of missing data for each column
    def generate_missing_table():
        df_results = pd.DataFrame(columns=['Missing', 'column', 'rows'])
        for feat in df:
            missing = df[feat].isna().sum()  # Count missing values in column
            if missing < 0:
                memory_col = df.drop(columns=[feat]).count().sum()  # Count non-null values if this column is dropped
                memory_rows = df.dropna(subset=[feat]).count().sum()  # Count non-null values if this column is kept
                df_results.loc[feat] = [missing, memory_col, memory_rows]  # Store results
        return df_results
    
    df_results = generate_missing_table()  # Generate initial missing data table
    
    # Iteratively remove the column or row that preserves the most non-null data
    while df_results.shape[0] > 0:
        max = df_results[['column', 'rows']].max(axis=1)[0]  # Find the max value in columns or rows
        max_axis = df_results.columns[df_results.isin([max]).any()][0]  # Determine whether to drop column or row
        print(max, max_axis)
    
        df_results.sort_values(by=[max_axis], ascending=False, inplace=True)  # Sort missing data table by max_axis
        if messages: print('\n', df_results)
    
        # Drop the most impactful missing data (either row or column)
        if max_axis == 'rows':
            df.dropna(axis=0, subset=[df_results.index[0]], inplace=True)  # Drop row with highest missing impact
        else:
            df.drop(columns=[df_results.index[0]], inplace=True)  # Drop column with highest missing impact
    
        df_results = generate_missing_table()  # Recalculate missing data table after dropping
    
    # Print the percentage of non-null values retained
    if messages: 
        print(f'{round(df.count().sum() / start_count * 100, 2)}% ({df.count().sum()}) / ({start_count}) of non-null cells were kept.')
        
    return df

In [None]:
# Numeric to Numeric
# age vs. rating (after joining movies_users and movies_ratings)
# release_year vs. average rating (from movies_titles and movies_ratings)

# REMINDER: USE SCATTER PLOTS, CORRELATION COEFFICIENTS
def bivariate_stats(df, label, roundto=4):
    import pandas as pd
    from scipy import stats
        
    output_df = pd.DataFrame(columns=['p', 'r', 'y = m(x) + b'])
    
    for feature in df.columns:
        if feature != label: # No need to calculate the relationship of the label with itself
            if pd.api.types.is_numeric_dtype(df[feature]):
                m, b, r, p, err = stats.linregress(df[feature], df[label]) # Calculate the regression line
                output_df.loc[feature] = [round(p, roundto), round(r, roundto), f'y = {round(m, roundto)}(x) + {round(b, roundto)}']
    
    return output_df

In [None]:
#SCATTER PLOT
def scatterplot(df, feature, label, roundto=3, linecolor='darkorange'):
    import pandas as pd
    from matplotlib import pyplot as plt
    import seaborn as sns
    from scipy import stats
    
    # Create a scatter plot with a regression line
    sns.regplot(x=df[feature], y=df[label], line_kws={"color": linecolor})
    
    # Perform linear regression to calculate regression statistics
    m, b, r, p, err = stats.linregress(df[feature], df[label])
    
    # Format the regression equation and statistics into a text string
    textstr  = 'Regression line:' + '\n'
    textstr += 'y  = ' + str(round(m, roundto)) + 'x + ' + str(round(b, roundto)) + '\n'
    textstr += 'r   = ' + str(round(r, roundto)) + '\n'  # Pearson correlation coefficient
    textstr += 'r²  = ' + str(round(r**2, roundto)) + '\n'  # Coefficient of determination (R-squared)
    textstr += 'p  = ' + str(round(p, roundto)) + '\n\n'  # P-value indicating significance
    
    # Display the regression statistics on the plot
    plt.text(1, 0.1, textstr, fontsize=12, transform=plt.gcf().transFigure)
    
    # Show the plot
    plt.show()

In [None]:
# Categorical to Numeric
# Average rating by type (Movie vs. TV Show)
# Average age by gender
# Number of users by streaming platform (e.g., Netflix = 0/1)

# REMINDER: USE BOX PLOTS, BAR PLOTS
def bivariate_stats(df, label, roundto=4):
      import pandas as pd
      from scipy import stats
      
      output_df = pd.DataFrame(columns=['missing', 'p', 'r', 'y = m(x) + b', 'F'])
      
      for feature in df.columns:
        if feature != label:
          df_temp = df[[feature, label]]
          df_temp = df_temp.dropna()
          missing = (df.shape[0] - df_temp.shape[0]) / df.shape[0] # Calculate the percent of rows missing if either the feature or label are missing
      
          if pd.api.types.is_numeric_dtype(df[feature]) and pd.api.types.is_numeric_dtype(df[label]):
            m, b, r, p, err = stats.linregress(df_temp[feature], df_temp[label])
            output_df.loc[feature] = [f'{missing:.2%}', 'r', round(r, roundto), round(p, roundto), f'y = {round(m, roundto)}(x) + {round(b, roundto)}', '-']
          
          elif not pd.api.types.is_numeric_dtype(df_temp[feature]) and not pd.api.types.is_numeric_dtype(df[label]):
            output_df.loc[feature] = [f'{missing:.2%}', '-', '-', '-', '-']
          
          else:
            if pd.api.types.is_numeric_dtype(df_temp[feature]): 
              num = feature
              cat = label
            else:
              num = label
              cat = feature
            
            groups = df_temp[cat].unique()
            group_lists = []
            for g in groups:
              g_list = df_temp[df_temp[cat] == g][num]
              group_lists.append(g_list)
      
            results = stats.f_oneway(*group_lists)
            F = results[0]
            p = results[1]
            output_df.loc[feature] = [f'{missing:.2%}', round(p, roundto), '-', '-', round(F, roundto)]
      return output_df

In [None]:
def bar_chart(df, feature, label, roundto=3):
        import pandas as pd
        from scipy import stats
        from matplotlib import pyplot as plt
        import seaborn as sns
        
        # Create a bar chart displaying the mean of the label for each category in the feature
        sns.barplot(df, x=feature, y=label)
        
        # Perform one-way ANOVA (F-test) to compare means of different groups in the categorical feature
        groups = df[feature].unique()  # Get unique categories of the feature
        group_lists = []
        
        # Create a list of values for each group
        for g in groups:
          g_list = df[df[feature] == g][label]
          group_lists.append(g_list)
        
        results = stats.f_oneway(*group_lists)  # Conduct ANOVA test
        F = results[0]  # Extract the F-statistic
        p = results[1]  # Extract the p-value
        
        # Conduct pairwise t-tests with Bonferroni correction
        ttests = []  # Store significant t-test results
        for i1, g1 in enumerate(groups):
          for i2, g2 in enumerate(groups):
            if i2 > i1:  # Compare each unique pair once
              type_1 = df[df[feature] == g1]
              type_2 = df[df[feature] == g2]
              t, p = stats.ttest_ind(type_1[label], type_2[label])  # Perform independent t-test
        
              # Store the results
              ttests.append([str(g1) + ' - ' + str(g2), round(t, roundto), round(p, roundto)])
        
        # Compute Bonferroni-corrected p-value threshold
        p_threshold = 0.05 / len(ttests)
        
        # Create annotation text for the plot
        textstr  = '   ANOVA' + '\n'
        textstr += 'F: ' + str(round(F, roundto)) + '\n'
        textstr += 'p: ' + str(round(p, roundto)) + '\n\n'
        
        # Add only significant t-test results
        for ttest in ttests:
            if ttest[2] lt;= p_threshold:  # If p-value is below Bonferroni threshold
                if 'Sig. comparisons (Bonferroni-corrected)' not in textstr:
                    textstr += 'Sig. comparisons (Bonferroni-corrected)' + '\n'
                textstr += str(ttest[0]) + ": t=" + str(ttest[1]) + ", p=" + str(ttest[2]) + '\n'
       
        # Display statistical results as text on the chart
        plt.text(1, 0.1, textstr, fontsize=12, transform=plt.gcf().transFigure)
        
        # Show the plot
        plt.show()

SyntaxError: invalid syntax (736788134.py, line 45)

In [13]:
# Categorical to Categorical 

#type vs. rating categories
# gender vs. platform subscriptions
# Genre (binary flags) vs. country

# REMINDER: USE CHI-SQURE TEST, CROSS-TAB
def bivariate_stats(df, label, roundto=4):
        import pandas as pd
        from scipy import stats  # Import statistical tests from scipy
          
        # Initialize an empty DataFrame to store the output
        output_df = pd.DataFrame(columns=['missing', 'p', 'r', 'y = m(x) + b', 'F', 'X2'])
        
        # Iterate through all columns in the DataFrame
        for feature in df.columns:
          if feature != label:  # Ensure we do not process the label column itself
            df_temp = df[[feature, label]].dropna()  # Drop rows with missing values for feature and label
            missing = (df.shape[0] - df_temp.shape[0]) / df.shape[0]  # Calculate the proportion of missing values
        
            # Case 1: Both feature and label are numeric (Continuous vs Continuous)
            if pd.api.types.is_numeric_dtype(df_temp[feature]) and pd.api.types.is_numeric_dtype(df_temp[label]):
              # Perform linear regression
              m, b, r, p, err = stats.linregress(df_temp[feature], df_temp[label])
              # Store results in output DataFrame
              output_df.loc[feature] = [
                f'{missing:.2%}',  # Percentage of missing values
                round(p, roundto),  # P-value of regression
                round(r, roundto),  # Pearson correlation coefficient
                f'y = {round(m, roundto)}(x) + {round(b, roundto)}',  # Regression equation
                '-', '-'  # Not applicable for F-test or Chi-square test
              ]
        
            # Case 2: Both feature and label are categorical (Categorical vs Categorical)
            elif not pd.api.types.is_numeric_dtype(df_temp[feature]) and not pd.api.types.is_numeric_dtype(df_temp[label]):
              # Create a contingency table (cross-tabulation)
              contingency_table = pd.crosstab(df_temp[feature], df_temp[label])
              # Perform Chi-square test for independence
              X2, p, dof, expected = stats.chi2_contingency(contingency_table)
              # Store results in output DataFrame
              output_df.loc[feature] = [
                f'{missing:.2%}',  # Percentage of missing values
                round(p, roundto),  # P-value from Chi-square test
                '-', '-', '-',  # Not applicable for regression or F-test
                round(X2, roundto)  # Chi-square test statistic
              ]
                
            # Case 3: One variable is categorical, the other is numeric (Categorical vs Continuous)
            else:
              # Identify which variable is numeric and which is categorical
              if pd.api.types.is_numeric_dtype(df_temp[feature]): 
                num = feature  # Numeric variable
                cat = label  # Categorical variable
              else:
                num = label
                cat = feature
              
              # Extract unique categories
              groups = df_temp[cat].unique()
              group_lists = []
      
              # Create a list of values for each group
              for g in groups:
                g_list = df_temp[df_temp[cat] == g][num]
                group_lists.append(g_list)
      
              # Perform one-way ANOVA (F-test) to compare group means
              results = stats.f_oneway(*group_lists)
              F = results[0]  # F-statistic
              p = results[1]  # P-value
      
              # Store results in output DataFrame
              output_df.loc[feature] = [
                f'{missing:.2%}',  # Percentage of missing values
                round(p, roundto),  # P-value from ANOVA
                '-', '-',  # Not applicable for regression or Chi-square
                round(F, roundto), '-'  # F-test statistic, Chi-square not applicable
              ]
      
        # Return the output DataFrame, sorted by p-value (smallest first)
        return output_df.sort_values(by=['p'])

In [None]:
def crosstab(df, feature, label, roundto=3):
    import pandas as pd
    from scipy.stats import chi2_contingency
    from matplotlib import pyplot as plt
    import seaborn as sns
    import numpy as np
    
    # Handle missing data: Remove rows where either feature or label has missing values
    df_temp = df[[feature, label]].dropna()
    
    # Bin categories if needed (consolidate rare categories into "Other")
    df_temp = bin_categories(df_temp, feature)
    
    # Generate the contingency table (crosstab)
    crosstab = pd.crosstab(df_temp[feature], df_temp[label])
    
    # Perform Chi-square test of independence
    X, p, dof, contingency_table = chi2_contingency(crosstab)
    
    # Format the test results into a text string
    textstr  = 'X²: ' + str(round(X, roundto)) + '\n'
    textstr += 'p = ' + str(round(p, roundto)) + '\n'
    textstr += 'dof = ' + str(dof)
    
    # Display the test results on the plot
    plt.text(0.9, 0.1, textstr, fontsize=12, transform=plt.gcf().transFigure)
    
    # Convert expected frequencies to a DataFrame with rounded integer values
    ct_df = pd.DataFrame(np.rint(contingency_table).astype('int64'), columns=crosstab.columns, index=crosstab.index)
    
    # Create a heatmap visualization of the contingency table
    sns.heatmap(ct_df, annot=True, fmt='d', cmap='coolwarm')
    
    # Show the heatmap
    plt.show()

In [None]:
def univariate(df, sample=500):
    import seaborn as sns
    import matplotlib.pyplot as plt
    import math
    
    df_results = pd.DataFrame(columns=['bin_groups', 'type', 'missing', 'unique', 'min',
                                        'median', 'max', 'mode', 'mean', 'std', 'skew'])
    
    for col in df:
        # Features that apply to all dtypes
        dtype = df[col].dtype
        missing = df[col].isna().sum()
        unique = df[col].nunique()
        mode = df[col].mode()[0]
        if pd.api.types.is_numeric_dtype(df[col]):
        # Features for numeric dtypes only
            min = df[col].min()
            max = df[col].max()
            mean = df[col].mean()
            median = df[col].median()
            std = df[col].std()
            skew = df[col].skew()
            df_results.loc[col] = ['-', dtype, missing, unique, min, median, max, mode,
                                    round(mean, 2), round(std, 2), round(skew, 2)]
        else:
            # Features for object dtypes only
            flag = df[col].value_counts()[(df[col].value_counts() / df.shape[0]) < 0.05].shape[0]
            df_results.loc[col] = [flag, dtype, missing, unique, '-', '-', '-', mode, '-', '-', '-']
    
    # Make a sub-DataFrame of features that are objects or have only two values; they will need countplots
    countplots = df_results[(df_results['type']=='object') | (df_results['unique']==2)]
    # Make a sub-DataFrame of features that are floats or ints with many values which will need histograms
    histograms = df_results[(df_results['type']=='float64') | ((df_results['unique']>10) & (df_results['type']=='int64'))]
    histograms = histograms[histograms['unique']>2] # Remove those that are binary
    
    # Create a set of countplots for the categorical features
    f, ax = plt.subplots(1, countplots.shape[0], figsize=[countplots.shape[0] * 1.5, 1.5])
    for i, col in enumerate(countplots.index):
        g = sns.countplot(data=df, x=col, color='g', ax=ax[i]);
        g.set_yticklabels('')
        g.set_ylabel('')
        ax[i].tick_params(labelrotation=90, left=False)
        ax[i].xaxis.set_label_position('top')
        sns.despine(left=True, top=True, right=True)
    
    plt.subplots_adjust(hspace=2, wspace=.5)
    plt.show()
    
    # Create a set of histograms for the numeric features
    f, ax = plt.subplots(1, histograms.shape[0], figsize=[histograms.shape[0] * 1.5, 1.5])
    for i, col in enumerate(histograms.index):
        g = sns.histplot(data=df.sample(n=sample, random_state=1), x=col, color='b', ax=ax[i], kde=True);
        g.set_yticklabels(labels=[])
        g.set_ylabel('')
        ax[i].tick_params(left=False)
        sns.despine(left=True, top=True, right=True)
    
    plt.subplots_adjust(hspace=2, wspace=.5)
    plt.show()
    
    return df_results