In [1]:
!pip install -q streamlit
!pip install pmdarima



In [2]:
# Install Kaggle API
!pip install kaggle



In [3]:
# Upload kaggle.json (the Kaggle API token file) from the local machine.
# NOTE: the recorded output of this cell shows a repeated upload being stored
# under a suffixed name ("kaggle (2).json") rather than replacing kaggle.json.
from google.colab import files
uploaded = files.upload()  # Opens a file picker; `uploaded` holds what was uploaded


Saving kaggle.json to kaggle (2).json


In [4]:
import glob
import json
import os

# A repeated upload is saved as "kaggle (2).json", "kaggle (3).json", ...
# (see the upload cell's output), so a fixed 'kaggle.json' may be a stale token.
# Use the most recently written token file instead.
token_files = glob.glob('kaggle*.json')
if not token_files:
    raise FileNotFoundError("No kaggle*.json found - run the upload cell first.")
token_path = max(token_files, key=os.path.getmtime)

# Load the JSON file
with open(token_path, 'r') as f:
    kaggle_creds = json.load(f)

# Expose the credentials as environment variables for the `kaggle` command
# that the download cell runs.
os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']


In [5]:
!kaggle datasets download -d anlgrbz/student-demographics-online-education-dataoulad

! unzip "student-demographics-online-education-dataoulad.zip"

Dataset URL: https://www.kaggle.com/datasets/anlgrbz/student-demographics-online-education-dataoulad
License(s): Attribution 4.0 International (CC BY 4.0)
student-demographics-online-education-dataoulad.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  student-demographics-online-education-dataoulad.zip
replace assessments.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [56]:
%%writefile app.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import io
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error , mean_absolute_error, r2_score
from pmdarima.arima.utils import nsdiffs
from pmdarima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from xgboost import XGBRegressor
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt


# Set the browser-tab title and icon of the Streamlit page.
st.set_page_config(
    page_title='Open University Learning Analytics Dataset (OULAD) Analysis',
    page_icon='📊'
)

# -------------------------------------Functions--------------------------------------------
# -------------------------------------Definition-------------------------------------------



@st.cache_data
def load_data(data_dir='/content'):
    """Read the seven OULAD CSV files and return them as DataFrames.

    Args:
        data_dir: Directory holding the extracted CSV files. Defaults to
            '/content', the location that used to be hard-coded, so a plain
            ``load_data()`` call reads exactly the same files as before.

    Returns:
        A 7-tuple ``(assesment, course, as_stu, info_stu, reg_stu, vle_stu, vle)``
        read from assessments.csv, courses.csv, studentAssessment.csv,
        studentInfo.csv, studentRegistration.csv, studentVle.csv and vle.csv,
        in that order.

    Raises:
        FileNotFoundError: if one of the files is missing from ``data_dir``.
    """
    import os  # local import: the app file does not import os at module level

    # Order matters: callers unpack the result positionally.
    file_names = (
        'assessments.csv',
        'courses.csv',
        'studentAssessment.csv',
        'studentInfo.csv',
        'studentRegistration.csv',
        'studentVle.csv',
        'vle.csv',
    )
    return tuple(pd.read_csv(os.path.join(data_dir, name)) for name in file_names)

# Load every OULAD table when the app module runs; the names below are
# module-level globals.
assesment, course, as_stu, info_stu, reg_stu, vle_stu, vle = load_data()

def missingValueAssessment(data):
    """Render a missing-value report for ``data`` in the Streamlit app.

    Shown in order: the text of ``data.info()``, the frame's shape, a
    per-column table of null counts and percentages, Plotly bar charts of the
    counts and of the percentages, the table once more, and a seaborn
    horizontal bar plot of the percentages.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        None; all output goes through ``st.*`` calls.
    """
    divider = "----------------------------------------------------------------------------------"

    st.title("Missing Value Assessment")

    # DataFrame.info() prints rather than returns, so capture it in a buffer.
    st.write("## Data Information")
    buffer = io.StringIO()
    data.info(buf=buffer)
    st.text(buffer.getvalue())

    st.write("### Dataframe Shape")
    st.write(f"**Rows**: {data.shape[0]}, **Columns**: {data.shape[1]}")

    st.write(divider)

    # Null count per column and its share of all rows.
    missing_values = data.isnull().sum()
    row_count = len(data)
    if row_count:
        missing_values_percentage = (missing_values / row_count) * 100
    else:
        # An empty frame has nothing missing; avoid 0/0 turning into NaN.
        missing_values_percentage = missing_values * 0.0

    # 'Percentage' is a Series, so the table stays indexed by column name.
    missing_values_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage': missing_values_percentage
    }).sort_values(by='Missing Values', ascending=False)

    st.write("### Missing Values")
    st.dataframe(missing_values_df)

    st.write("### Missing Values Bar Chart")
    fig = px.bar(missing_values_df, x='Column', y='Missing Values',
                 title='Missing Values by Column',
                 labels={'Missing Values': 'Count of Missing Values'},
                 color='Missing Values',
                 color_continuous_scale=px.colors.sequential.Viridis)
    fig.update_layout(xaxis_tickangle=-45)
    st.plotly_chart(fig)

    st.write("### Missing Values Percentage")
    st.dataframe(missing_values_df[['Column', 'Percentage']])

    st.write("### Missing Values Percentage Bar Chart")
    fig_percentage = px.bar(missing_values_df, x='Column', y='Percentage',
                            title='Percentage of Missing Values by Column',
                            labels={'Percentage': 'Percentage of Missing Values'},
                            color='Percentage',
                            color_continuous_scale=px.colors.sequential.Viridis)
    fig_percentage.update_layout(xaxis_tickangle=-45)
    st.plotly_chart(fig_percentage)

    st.write(divider)

    st.write("### Missing Values DataFrame")
    st.write(missing_values_df)

    st.write(divider)

    # Draw on an explicit Figure/Axes instead of the global pyplot state, hand
    # that Figure to Streamlit, then close it so figures do not pile up in
    # memory every time the script reruns.
    st.write("### Missing Values Visualization")
    mpl_fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Percentage', y='Column',
                data=missing_values_df.sort_values(by='Percentage', ascending=False),
                ax=ax)
    ax.set_title('Percentage of Missing Values by Column')
    ax.set_xlabel('Percentage of Missing Values')
    ax.set_ylabel('Columns')
    st.pyplot(mpl_fig)
    plt.close(mpl_fig)

#Exploratory data analysis
def plotActivityCounts(data):
    """Render four Plotly bar charts of activity, module and presentation counts.

    Sections sent to Streamlit, in order:
      1. rows per ``activity_type`` (vertical bars, most frequent first);
      2. rows per ``code_module``, stacked by ``code_presentation``;
      3. rows per ``code_presentation`` (horizontal bars);
      4. rows per ``code_module`` (horizontal bars).

    Args:
        data: DataFrame providing the ``activity_type``, ``code_module`` and
            ``code_presentation`` columns.

    Returns:
        None; every chart is emitted with ``st.plotly_chart``.
    """

    def tally(column):
        # value_counts() reshaped into a two-column frame: [<column>, 'count'].
        frame = data[column].value_counts().reset_index()
        frame.columns = [column, 'count']
        return frame

    def publish(figure, x_title, y_title, **more_layout):
        # Layout shared by all four charts (centred title, white template),
        # followed by rendering the figure.
        figure.update_layout(
            xaxis_title=x_title,
            yaxis_title=y_title,
            title={'x': 0.5, 'xanchor': 'center'},
            template='plotly_white',
            **more_layout
        )
        st.plotly_chart(figure)

    # 1. Activity types, largest count first; x labels rotated for readability.
    st.subheader('Count of Each Activity Type')
    by_activity = tally('activity_type').sort_values(by='count', ascending=False)
    pastel = px.colors.qualitative.Pastel
    publish(
        px.bar(by_activity,
               x='activity_type',
               y='count',
               title='Count of Each Activity Type',
               labels={'activity_type': 'Activity Type', 'count': 'Count'},
               text='count',
               color='activity_type',
               color_discrete_sequence=pastel),
        'Activity Type',
        'Count',
        xaxis_tickangle=45,
    )

    # 2. Module totals, one stacked segment per presentation.
    st.subheader('Counts of Code Presentations for Each Code Module')
    per_module_presentation = (
        data.groupby(['code_module', 'code_presentation']).size().reset_index(name='count')
    )
    publish(
        px.bar(per_module_presentation,
               x='code_module',
               y='count',
               color='code_presentation',
               title='Counts of Code Presentations for Each Code Module',
               labels={'code_module': 'Code Module', 'count': 'Count',
                       'code_presentation': 'Code Presentation'},
               color_discrete_sequence=px.colors.sequential.Viridis,
               barmode='stack'),
        'Code Module',
        'Count',
    )

    # 3 and 4. The same horizontal count chart for two different columns.
    horizontal_sections = (
        ('code_presentation', 'Count of Code Presentations', 'Code Presentation'),
        ('code_module', 'Count of Code Modules', 'Code Module'),
    )
    for column, heading, pretty_name in horizontal_sections:
        st.subheader(heading)
        counts = tally(column)
        publish(
            px.bar(counts,
                   x='count',
                   y=column,
                   title=heading,
                   labels={column: pretty_name, 'count': 'Count'},
                   text='count',
                   color=column,
                   color_discrete_sequence=pastel,
                   orientation='h'),
            'Count',
            pretty_name,
        )
def student_vle_eda(data):
    """Plot the total number of VLE clicks per day as a Plotly line chart.

    Args:
        data: DataFrame with a ``date`` column and a ``sum_click`` column.

    Returns:
        None; a heading and the chart are written to the Streamlit page.
    """
    # One row per distinct date holding the summed clicks of that date.
    clicks_per_day = data.groupby('date')['sum_click'].sum().reset_index()

    st.write("# Time Series Plot of Daily Student Interactions with VLE")

    axis_names = {'date': 'Date', 'sum_click': 'Sum of Clicks'}
    chart = px.line(
        clicks_per_day,
        x='date',
        y='sum_click',
        title='Daily Student Interactions with VLE',
        labels=axis_names,
    )

    # Centre the title and use the white template.
    layout_options = {
        'xaxis_title': 'Date',
        'yaxis_title': 'Sum of Clicks',
        'title': {'x': 0.5, 'xanchor': 'center'},
        'template': 'plotly_white',
    }
    chart.update_layout(**layout_options)

    st.plotly_chart(chart)


def reg_course_eda(data):
    """Render count bar charts for ``code_presentation`` and ``code_module``.

    Args:
        data: DataFrame with ``code_presentation`` and ``code_module`` columns.

    Returns:
        None; two headings and two Plotly charts are written to Streamlit.
    """
    # (column, heading and chart title, axis label, extra px.bar arguments).
    # Only the module chart passes an explicit orientation; the presentation
    # chart passes none.
    chart_specs = (
        ('code_presentation', 'Count of Code Presentations', 'Code Presentation', {}),
        ('code_module', 'Count of Code Modules', 'Code Module', {'orientation': 'h'}),
    )

    for column, heading, pretty_name, bar_extras in chart_specs:
        st.subheader(heading)

        # Occurrences of each distinct value as a [<column>, 'count'] frame.
        tally = data[column].value_counts().reset_index()
        tally.columns = [column, 'count']

        chart = px.bar(
            tally,
            x='count',
            y=column,
            title=heading,
            labels={column: pretty_name, 'count': 'Count'},
            color=column,
            color_discrete_sequence=px.colors.qualitative.Dark2,
            **bar_extras
        )
        chart.update_layout(
            xaxis_title='Count',
            yaxis_title=pretty_name,
            title={'x': 0.5, 'xanchor': 'center'},
            template='plotly_white',
        )
        st.plotly_chart(chart)
def stu_reg_eda(data):
    """Plot student registrations over time, overall and per module.

    Args:
        data: DataFrame with ``date_registration``, ``id_student`` and
            ``code_module`` columns.

    Returns:
        None; two headings, a small preview table and two Plotly line charts
        are written to the Streamlit page.
    """

    def draw(frame, y, y_title, chart_title, **line_options):
        # Both charts share the x column, the template and the layout; only the
        # y column, the titles and a few px.line options differ.
        figure = px.line(
            frame,
            x='date_registration',
            y=y,
            title=chart_title,
            labels={'date_registration': 'Date of Registration', y: y_title},
            template='plotly_white',
            **line_options
        )
        figure.update_layout(
            xaxis_title='Date of Registration',
            yaxis_title=y_title,
            title={'x': 0.5, 'xanchor': 'center'},
            xaxis_tickangle=45,
        )
        st.plotly_chart(figure)

    # Distinct students per registration date.
    st.subheader('Registration Count of Students With Time')
    students_per_day = data.groupby('date_registration')['id_student'].nunique().reset_index()
    st.dataframe(students_per_day.head())
    draw(students_per_day, 'id_student', 'Count of Students',
         'Count of Student Registration with Time')

    # Registration rows per date and module, one line per module.
    st.subheader('Registration Count Per Module of Students With Time')
    per_module = data.groupby(['date_registration', 'code_module']).size().reset_index(name='count')
    draw(per_module, 'count', 'Count of Registrations',
         'Student Registration Trends for Each Module',
         color='code_module', markers=True)

def _info_stu_counts(frame, column, sort_by=None, ascending=True):
    """Return the value counts of ``frame[column]`` as a [column, 'count'] frame."""
    counts = frame[column].value_counts().reset_index()
    counts.columns = [column, 'count']
    if sort_by is not None:
        counts = counts.sort_values(by=sort_by, ascending=ascending)
    return counts


def _info_stu_bar(frame, x, y, color, title, labels, palette, **bar_kwargs):
    """Draw one Plotly Express bar chart and render it in Streamlit."""
    fig = px.bar(frame, x=x, y=y, title=title, labels=labels, color=color,
                 color_discrete_sequence=palette, **bar_kwargs)
    # Axis titles reuse the display labels supplied for the x and y columns.
    fig.update_layout(xaxis_title=labels[x], yaxis_title=labels[y], template='plotly_white')
    st.plotly_chart(fig)


def _info_stu_stacked_results(frame, split_column, title, sort_by_total=True):
    """Render ``final_result`` counts as bars stacked by ``split_column``."""
    counts = frame.groupby(['final_result', split_column]).size().reset_index(name='count')
    # Sum only the numeric 'count' column; the split column holds labels.
    totals = counts.groupby('final_result')['count'].sum()
    if sort_by_total:
        totals = totals.sort_values(ascending=False)
    pivot_data = (
        counts.pivot(index='final_result', columns=split_column, values='count')
        .reindex(totals.index.tolist())
        .fillna(0)
    )

    fig = go.Figure()
    for level in pivot_data.columns:
        fig.add_trace(go.Bar(
            x=pivot_data.index,
            y=pivot_data[level],
            name=level,
            hovertemplate='%{y}',
            text=pivot_data[level],
            textposition='auto'
        ))
    fig.update_layout(barmode='stack', xaxis_title='Final Result', yaxis_title='Count',
                      title=title, template='plotly_white')
    st.plotly_chart(fig)


def info_stu_eda(info_stu):
    """Render the exploratory charts for the student-information table.

    Sections, in order: a 10-row preview; gender counts (bar and pie);
    counts by age band and by region; region stacked by age band; a box plot of
    studied credits by region; counts of highest education and of final
    result; final result by gender, region, highest education and module
    (stacked bars); final result by highest education for each module; and a
    correlation heat map of the numerically encoded table.

    Args:
        info_stu: DataFrame with the columns ``gender``, ``age_band``,
            ``region``, ``studied_credits``, ``highest_education``,
            ``final_result``, ``code_module``, ``code_presentation``,
            ``imd_band`` and ``disability``.

    Returns:
        None; everything is written to the Streamlit page. ``info_stu`` itself
        is not modified (the encoding step works on a copy).
    """
    pastel = px.colors.qualitative.Pastel

    st.subheader('Head of the data frame')
    st.write(info_stu.head(10))

    # 1. Count of Gender
    st.subheader('Count of Gender')
    gender_counts = _info_stu_counts(info_stu, 'gender')
    _info_stu_bar(gender_counts, x='count', y='gender', color='gender',
                  title='Count of Gender',
                  labels={'gender': 'Gender', 'count': 'Count'},
                  palette=pastel, text='count')

    # 2. Distribution of Gender (the same counts, drawn as a pie)
    st.subheader('Distribution of Gender')
    fig = px.pie(gender_counts, names='gender', values='count',
                 title='Distribution of Gender',
                 color='gender',
                 color_discrete_sequence=pastel)
    fig.update_layout(template='plotly_white')
    st.plotly_chart(fig)

    # 3. Count of Students by Age Band (bars ordered by band label)
    st.subheader('Count of Students by Age Band')
    _info_stu_bar(_info_stu_counts(info_stu, 'age_band', sort_by='age_band'),
                  x='age_band', y='count', color='age_band',
                  title='Count of Students by Age Band',
                  labels={'age_band': 'Age Band', 'count': 'Count'},
                  palette=pastel, text='count')

    # 4. Count of Students by Region (bars ordered by region name)
    st.subheader('Count of Students by Region')
    _info_stu_bar(_info_stu_counts(info_stu, 'region', sort_by='region'),
                  x='region', y='count', color='region',
                  title='Count of Students by Region',
                  labels={'region': 'Region', 'count': 'Count'},
                  palette=pastel, text='count')

    # 5. Stacked Bar Chart of Region by Age Band
    st.subheader('Stacked Bar Chart of Region by Age Band')
    cross_tab = pd.crosstab(info_stu['region'], info_stu['age_band'])
    fig = go.Figure()
    for position, age_band in enumerate(cross_tab.columns):
        fig.add_trace(go.Bar(
            x=cross_tab.index,
            y=cross_tab[age_band],
            name=f'Age Band {age_band}',
            # Wrap around so more bands than palette colours cannot raise IndexError.
            marker_color=pastel[position % len(pastel)],
            hovertemplate='%{y}',
            text=cross_tab[age_band],
            textposition='auto'
        ))
    fig.update_layout(barmode='stack', xaxis_title='Region', yaxis_title='Count',
                      title='Stacked Bar Chart of Region by Age Band',
                      legend_title='Age Band', template='plotly_white')
    st.plotly_chart(fig)

    # 6. Box Plot of Studied Credits by Region
    st.subheader('Box Plot of Studied Credits by Region ')
    fig = px.box(info_stu, x='studied_credits', y='region', color='region',
                 orientation='h', title='Boxplot of Studied Credits by Region',
                 labels={'region': 'Region', 'studied_credits': 'Studied Credits'},
                 # dropna(): a missing region would make sorted() compare NaN with str.
                 category_orders={'region': sorted(info_stu['region'].dropna().unique())})
    fig.update_layout(xaxis_title='Studied Credits', yaxis_title='Region', template='plotly_white')
    fig.update_traces(whiskerwidth=0.5)
    st.plotly_chart(fig)

    # 7. Count Plot of Highest Education (most frequent first)
    st.subheader('Count Plot of Highest Education')
    _info_stu_bar(_info_stu_counts(info_stu, 'highest_education', sort_by='count', ascending=False),
                  x='count', y='highest_education', color='highest_education',
                  title='Count of Highest Education',
                  labels={'highest_education': 'Highest Education', 'count': 'Count'},
                  palette=px.colors.qualitative.Dark2, orientation='h')

    # 8. Count Of Exam Results (most frequent first)
    st.subheader('Count Of Exam Results')
    _info_stu_bar(_info_stu_counts(info_stu, 'final_result', sort_by='count', ascending=False),
                  x='final_result', y='count', color='final_result',
                  title='Count of Exam Results',
                  labels={'final_result': 'Result Status', 'count': 'Count'},
                  palette=px.colors.qualitative.Set2)

    # 9. Final Results by Gender, result categories ordered by total count
    st.subheader('Final Results by Gender')
    by_gender = info_stu.groupby(['final_result', 'gender']).size().reset_index(name='count')
    result_order = (
        by_gender.groupby('final_result')['count'].sum()
        .sort_values(ascending=False)
        .index.tolist()  # category_orders takes plain lists, not an Index
    )
    _info_stu_bar(by_gender, x='final_result', y='count', color='gender',
                  title='Final Results by Gender (Sorted)',
                  labels={'final_result': 'Final Result', 'count': 'Count'},
                  palette=px.colors.qualitative.Set2,
                  category_orders={'final_result': result_order})

    # 10-12. Final results stacked by region, highest education and module
    stacked_sections = (
        ('Final Results by Region', 'region', 'Final Results by Region (Stacked)'),
        ('Final Results by Highest Education', 'highest_education',
         'Final Results by Highest Education (Stacked)'),
        ('Final Results by Module', 'code_module', 'Final Results by Module (Stacked)'),
    )
    for heading, split_column, chart_title in stacked_sections:
        st.subheader(heading)
        _info_stu_stacked_results(info_stu, split_column, chart_title)

    # 13. One education-stacked chart per module; result categories keep their
    # sorted label order here rather than being ordered by total.
    st.subheader('Final Results by Module Grouped by Module')
    for mname in info_stu['code_module'].unique():
        _info_stu_stacked_results(
            info_stu[info_stu['code_module'] == mname],
            'highest_education',
            f'Final Results by Highest Education (Stacked) - Module {mname}',
            sort_by_total=False,
        )

    # 14. Correlation Matrix of the numerically encoded table
    st.subheader('Correlation Matrix')
    student_info = info_stu.copy()
    student_info['gender'] = student_info['gender'].map({'M': 0, 'F': 1})
    student_info['disability'] = student_info['disability'].map({'N': 0, 'Y': 1})
    for column in ('region', 'highest_education', 'imd_band', 'age_band',
                   'code_module', 'code_presentation'):
        student_info[column] = student_info[column].astype('category').cat.codes
    final_result_map = {'Distinction': 4, 'Pass': 3, 'Fail': 2, 'Withdrawn': 1}
    student_info['final_result'] = student_info['final_result'].map(final_result_map)

    # Correlate numeric columns only, so any extra text column in the input is
    # ignored instead of making DataFrame.corr() raise.
    correlation_matrix = student_info.select_dtypes(include=[np.number]).corr()

    fig = px.imshow(correlation_matrix, text_auto=True, aspect='auto',
                    color_continuous_scale=px.colors.sequential.Blues)
    fig.update_layout(title='Correlation Matrix with Final Result as Target', template='plotly_white')
    st.plotly_chart(fig)


def as_stu_eda(as_stu):
    """Render a preview and the average-score trend of the student-assessment table.

    The mean ``score`` per ``date_submitted`` is labelled 'Pass (>= 40)' when
    it is 40 or more and 'Fail (< 40)' otherwise, then drawn as a Plotly line
    chart coloured by that label, with one extra trace covering every date.

    Args:
        as_stu: DataFrame with ``date_submitted`` and ``score`` columns.

    Returns:
        None; headings, the preview and the chart are written to Streamlit.
    """
    # This function receives the assessment table, so label the preview as such.
    st.subheader('Head of Student Assessment Data')
    st.write(as_stu.head(10))

    # Score Trend Over Time
    st.subheader('Score Trend Over Time')

    average_score = as_stu.groupby('date_submitted')['score'].mean().reset_index()
    # right=False gives the bins [-inf, 40) and [40, inf), so an average of
    # exactly 40 lands in 'Pass (>= 40)' as the labels state. With pd.cut's
    # default right-closed bins it was labelled 'Fail (< 40)'.
    average_score['Score_Category'] = pd.cut(average_score['score'],
                                             bins=[-float('inf'), 40, float('inf')],
                                             labels=['Fail (< 40)', 'Pass (>= 40)'],
                                             right=False)

    # Interactive line plot, one coloured trace per score category.
    fig = px.line(average_score, x='date_submitted', y='score', color='Score_Category', markers=True,
                  title='Score Trend Over Time',
                  labels={'date_submitted': 'Date Submitted', 'score': 'Average Score'},
                  color_discrete_sequence=px.colors.qualitative.Set2)

    fig.update_layout(xaxis_title='Date', yaxis_title='Score', template='plotly_white')
    fig.update_xaxes(tickangle=45)
    fig.update_layout(legend_title_text='Score Category')

    # Extra trace over all dates, regardless of category.
    fig.add_scatter(x=average_score['date_submitted'], y=average_score['score'],
                    mode='markers+lines', marker=dict(size=10, symbol='circle'))

    # Same hover text for every trace.
    fig.update_traces(hovertemplate='Date: %{x}<br>Average Score: %{y}')

    st.plotly_chart(fig)


@st.cache_data

def data_merge(as_stu, info_stu, start_date='2023-01-01'):
    """Clean, merge and numerically encode the assessment and student-info tables.

    Steps: fill missing ``imd_band`` with 'unknown'; drop assessment rows
    without a ``score``; turn the integer ``date_submitted`` into a calendar
    date (day 1 maps to ``start_date``); left-join student info onto the
    assessments by ``id_student``; encode the categorical columns as numbers.

    Args:
        as_stu: Student-assessment DataFrame with ``id_student``,
            ``date_submitted`` and ``score`` columns.
        info_stu: Student-info DataFrame with ``id_student`` and the
            demographic and result columns encoded below.
        start_date: Calendar date assigned to ``date_submitted == 1``.
            Defaults to '2023-01-01', the value that used to be hard-coded.

    Returns:
        A new merged DataFrame. Neither input frame is modified.
    """
    # Assign the filled column back instead of using fillna(inplace=True) on a
    # column selection, which is not guaranteed to update the frame (chained
    # assignment; a no-op under pandas copy-on-write).
    info_stu_cleaned = info_stu.assign(imd_band=info_stu['imd_band'].fillna('unknown'))

    # Rows without a score are dropped here, so `score` has no missing values
    # afterwards and needs no further filling once merged.
    as_stu_cleaned = as_stu.dropna(subset=['score'])

    # date_submitted is a day number: 1 -> start_date, 2 -> the next day, ...
    origin = pd.to_datetime(start_date)
    as_stu_cleaned = as_stu_cleaned.assign(
        date_submitted=origin + pd.to_timedelta(as_stu_cleaned['date_submitted'] - 1, unit='D')
    )

    # NOTE(review): the join key is id_student alone. If info_stu holds several
    # rows per student (e.g. one per module/presentation), each assessment row
    # is repeated once per matching info row - confirm this is intended.
    data = pd.merge(as_stu_cleaned, info_stu_cleaned, on='id_student', how='left')

    # Two-valued columns get explicit 0/1 codes.
    data['gender'] = data['gender'].map({'M': 0, 'F': 1})
    data['disability'] = data['disability'].map({'N': 0, 'Y': 1})

    # Remaining categoricals become category codes.
    for column in ('region', 'highest_education', 'imd_band', 'age_band',
                   'code_module', 'code_presentation'):
        data[column] = data[column].astype('category').cat.codes

    # Outcome mapped to 1 (Withdrawn) .. 4 (Distinction).
    final_result_map = {
        'Distinction': 4,
        'Pass': 3,
        'Fail': 2,
        'Withdrawn': 1
    }
    data['final_result'] = data['final_result'].map(final_result_map)

    return data


def _show_tree_figure(model, feature_names, title):
    """Draw a fitted tree with ``plot_tree`` and render it on the Streamlit page."""
    fig = plt.figure(figsize=(20, 10))
    # A plain list is passed: some scikit-learn releases validate this argument
    # as a list and reject a pandas Index.
    plot_tree(model, feature_names=list(feature_names), filled=True)
    plt.title(title)
    st.pyplot(fig)
    plt.close(fig)  # figures would otherwise accumulate on every Streamlit rerun


def _show_actual_vs_predicted(y_true, y_pred):
    """Plot actual and predicted scores against their position in the test set."""
    fig = plt.figure(figsize=(14, 7))
    plt.plot(y_true.values, label='Actual Values', marker='o')
    plt.plot(y_pred, label='Predicted Values', marker='x')
    plt.title('Actual vs Predicted Values')
    plt.xlabel('Index')
    plt.ylabel('Score')
    plt.legend()
    st.pyplot(fig)
    plt.close(fig)


def decision_tree(student_info_assessments):
    """Fit and report a regularised decision-tree regressor and a pruned variant.

    Parameters
    ----------
    student_info_assessments : pandas.DataFrame
        Modelling table containing ``id_student``, the target ``score`` and
        the feature columns; only numeric columns are used as features.

    Metrics, the tree drawings, the pruning-path plot and the
    actual-vs-predicted plots are written to the Streamlit page.
    Nothing is returned.
    """

    st.subheader('Decision Tree Regressor')

    st.write("""
    my_dt = DecisionTreeRegressor(
        max_depth=10,                 Limits depth of the tree
        min_samples_split=15,        Requires at least 15 samples to consider a split
        min_samples_leaf=5,         Requires at least 5 samples per leaf
        max_leaf_nodes=200,          Maximum number of leaf nodes
    )
    my_dt.fit(X_train, y_train)  # Fit to training data
    """)

    # Rows without a target are dropped; features are the numeric columns only.
    data = student_info_assessments.dropna(subset=['score'])
    X = data.drop(columns=['id_student', 'score']).select_dtypes(include=[np.number])
    y = data['score']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Regularised tree. The seed makes tie-breaking between equally good splits
    # reproducible across reruns.
    my_dt = DecisionTreeRegressor(
        random_state=44,
        max_depth=10,                # Limits depth of the tree
        min_samples_split=15,        # Requires at least 15 samples to consider a split
        min_samples_leaf=5,          # Requires at least 5 samples per leaf
        max_leaf_nodes=200,          # Maximum number of leaf nodes
    )
    my_dt.fit(X_train, y_train)

    y_pred = my_dt.predict(X_test)

    st.write("MSE: ", mean_squared_error(y_test, y_pred))
    st.write("MAE: ", mean_absolute_error(y_test, y_pred))
    st.write("R2 Score: ", r2_score(y_test, y_pred))

    _show_tree_figure(my_dt, X.columns, 'Decision Tree Visualization')
    _show_actual_vs_predicted(y_test, y_pred)

    st.subheader('Decision Tree Regressor with Pruning')

    # Cost-complexity pruning path for the same hyper-parameters.
    path = my_dt.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities

    st.write('Total Impurity vs effective alpha for training set')

    fig = plt.figure(figsize=(10, 6))
    plt.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
    plt.xlabel("effective alpha")
    plt.ylabel("total impurity of leaves")
    plt.title("Total Impurity vs effective alpha for training set")
    st.pyplot(fig)
    plt.close(fig)

    # The third alpha on the path is used (a fixed choice, not a tuned one).
    # The index is clamped so a very short path cannot raise IndexError.
    alpha_selected = ccp_alphas[min(2, len(ccp_alphas) - 1)]

    st.write('Optimal alpha selected is:: ', alpha_selected)

    # Re-train with the selected alpha and a much smaller leaf budget.
    my_dt_pruned = DecisionTreeRegressor(
        random_state=44,
        max_depth=10,
        min_samples_split=15,
        min_samples_leaf=5,
        max_leaf_nodes=20,
        ccp_alpha=alpha_selected
    )
    my_dt_pruned.fit(X_train, y_train)

    y_pred = my_dt_pruned.predict(X_test)
    st.write("MSE after pruning:", mean_squared_error(y_test, y_pred))
    st.write("MAE after pruning:", mean_absolute_error(y_test, y_pred))
    st.write("R2 Score: ", r2_score(y_test, y_pred))

    _show_tree_figure(my_dt_pruned, X.columns, 'Pruned Decision Tree Visualization')
    _show_actual_vs_predicted(y_test, y_pred)


def linear_reg(student_info_assessments):
  """Fit an ordinary least-squares model on the numeric features and report it.

  Parameters
  ----------
  student_info_assessments : pandas.DataFrame
      Modelling table containing ``id_student``, the target ``score`` and
      the feature columns; only numeric columns are used as features.

  MSE, MAE, R2 and an actual-vs-predicted scatter plot (first 1000 test
  samples) are written to the Streamlit page. Nothing is returned.
  """

  st.subheader('Linear Regression')

  st.write("""
  my_lm = LinearRegression()
  """)

  data = student_info_assessments.dropna(subset=['score'])
  X = data.drop(columns=['id_student', 'score']).select_dtypes(include=[np.number])
  y = data['score']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  my_lm = LinearRegression()
  my_lm.fit(X=X_train, y=y_train)

  y_pred = my_lm.predict(X_test)
  st.write("MSE ", mean_squared_error(y_test, y_pred))
  st.write("MAE ", mean_absolute_error(y_test, y_pred))
  st.write("R2_score ", r2_score(y_test, y_pred))

  # Plot a bounded preview. Slicing starts at position 0 (the previous [1:1000]
  # silently skipped the first test sample) and is purely positional on both sides.
  preview_size = 1000
  results = pd.DataFrame({
      'Actual': y_test.iloc[:preview_size].to_numpy(),
      'Predicted': y_pred[:preview_size],
  })

  fig, ax = plt.subplots(figsize=(10, 6))
  sns.scatterplot(x='Actual', y='Predicted', data=results, alpha=0.5, color='green', ax=ax)
  sns.lineplot(x='Actual', y='Actual', data=results, color='red', ax=ax)  # Diagonal line

  ax.set_title('Actual vs Predicted Scores')
  ax.set_xlabel('Actual Scores')
  ax.set_ylabel('Predicted Scores')
  st.pyplot(fig)
  plt.close(fig)  # avoid accumulating figures across Streamlit reruns


def arima(df):
  """Fit an auto-ARIMA model on the weekly mean score and show a hold-out forecast.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a datetime column ``date_submitted`` and a numeric
      column ``score``.

  The series is resampled to weekly means, split 80/20 in time order, and
  the forecast for the last 20% is compared with the observed values.
  All output goes to the Streamlit page. Nothing is returned.
  """
  # Local import so the function does not depend on a module-level import
  # that may be missing; statsmodels is already a dependency of this file.
  from statsmodels.tsa.stattools import adfuller

  st.subheader('ARIMA')

  st.write("""
  model = auto_arima(train_target,
                      start_p=1, start_q=1,
                      test='adf',       # Use ADF test to find optimal 'd'
                      max_p=3, max_q=3, # Maximum p and q
                      m=1,              # Frequency of series
                      d=0,           # Let model determine 'd'
                      seasonal=False,   # No seasonality
                      start_P=0,
                      trace=True,       # Print status
                      error_action='ignore',
                      suppress_warnings=True,
                      stepwise=True)    # Apply stepwise algorithm
  """)

  data = df[['date_submitted', 'score']].copy()

  # Set date_submitted as index
  data = data.set_index('date_submitted')

  st.subheader('Testing for Stationarity')

  # NOTE(review): nsdiffs estimates the *seasonal* differencing order although
  # the label below says "d" -- confirm which one is intended.
  for col in list(data):
    d = nsdiffs(data[col],
            m=10,
            max_D=12,
            test='ch')

    st.write('Columns:: ', col, ' || d:: ', d)

  # plot_acf / plot_pacf each create their own figure; render both explicitly
  # (plt.gcf() would only return the most recent one, i.e. the PACF).
  for plot_correlation in (plot_acf, plot_pacf):
    corr_fig = plot_correlation(data['score'], lags=40)
    st.pyplot(corr_fig)
    plt.close(corr_fig)

  # Weekly mean score, in time order, with empty weeks interpolated.
  data_weekly = data.resample('W').mean()
  data_weekly = data_weekly.sort_index()
  data_weekly = data_weekly.interpolate(method='time')

  # plt.show() renders nothing in Streamlit, so the trend is drawn on its own
  # figure and handed to st.pyplot.
  trend_fig, trend_ax = plt.subplots()
  trend_ax.plot(data_weekly['score'])
  trend_ax.set_title('Trend for score ')
  trend_ax.set_xlabel('Date Submitted')
  trend_ax.set_ylabel('Score')
  st.pyplot(trend_fig)
  plt.close(trend_fig)

  adf_test = adfuller(data_weekly['score'])
  # Output the results
  st.write('ADF Statistic: %f' % adf_test[0])
  st.write('p-value: %f' % adf_test[1])
  # ADF null hypothesis: the series has a unit root (is non-stationary).
  # A small p-value rejects it, i.e. the series IS stationary.
  is_stationary = adf_test[1] < 0.05
  if is_stationary:
    st.write('Since p-value < 0.05, the series is stationary (d = 0)')
  else:
    st.write('Since p-value >= 0.05, the series is not stationary; auto_arima will choose d')

  train_size = int(len(data_weekly) * 0.8)
  train_data, test_data = data_weekly.iloc[:train_size], data_weekly.iloc[train_size:]

  # Target variable
  train_target = train_data['score']
  test_target = test_data['score']

  # Running Auto ARIMA
  model = auto_arima(train_target,
                        start_p=1, start_q=1,
                        test='adf',       # Only consulted when d is None
                        max_p=1, max_q=1, # Maximum p and q
                        m=1,              # Frequency of series
                        d=0 if is_stationary else None,  # No differencing if stationary, else let the model determine 'd'
                        seasonal=False,   # No seasonality
                        start_P=0,
                        trace=True,       # Print status
                        error_action='ignore',
                        suppress_warnings=True,
                        stepwise=True)    # Apply stepwise algorithm

  st.subheader('Arima Model Summary')
  st.write(model.summary())

  st.subheader('Forecasting')

  # Forecast exactly as many periods as the hold-out set contains.
  n_periods = len(test_target)
  forecast, conf_int = model.predict(n_periods=n_periods, return_conf_int=True)

  # predict() may return a pandas Series carrying its own index; strip it so the
  # values are placed positionally on the test index instead of being re-indexed.
  forecast = np.asarray(forecast)
  forecast_df = pd.DataFrame({'Forecast': forecast}, index=test_target.index)
  conf_int_df = pd.DataFrame(np.asarray(conf_int), index=test_target.index, columns=['Lower CI', 'Upper CI'])

  mae = mean_absolute_error(test_target, forecast)
  rmse = np.sqrt(mean_squared_error(test_target, forecast))
  st.write(f'MAE: {mae}')
  st.write(f'RMSE: {rmse}')

  forecast_fig = plt.figure(figsize=(12, 6))
  plt.plot(train_target, label='Train')
  plt.plot(test_target, label='Test')
  plt.plot(forecast_df, label='Forecast')
  plt.fill_between(conf_int_df.index,
                  conf_int_df['Lower CI'],
                  conf_int_df['Upper CI'],
                  color='k', alpha=.15)
  plt.legend()
  st.pyplot(forecast_fig)
  plt.close(forecast_fig)

def merge_data_analysis():
  """Build the student-level and assessment-level tables used by the dropout analysis.

  Reads the module-level frames ``as_stu``, ``assesment``, ``info_stu``,
  ``course``, ``reg_stu`` and ``vle_stu``.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
      ``df_student_full``: one row per row of ``info_stu``, extended with the
      course, registration and total ``sum_click`` columns.
      ``df_assessments_full``: ``as_stu`` left-joined with ``assesment``.
  """
  enrolment_keys = ['code_module', 'code_presentation', 'id_student']

  df_assessments_full = pd.merge(as_stu, assesment, on='id_assessment', how='left')
  df_student_course = pd.merge(info_stu, course, on=['code_module', 'code_presentation'], how='left')
  df_student_full = pd.merge(df_student_course, reg_stu, on=enrolment_keys, how='left')

  # vle_stu holds one row per recorded interaction. Joining it as-is would repeat
  # every student row once per interaction record, so clicks are summed per
  # enrolment first and joined as a single column.
  clicks_per_enrolment = vle_stu.groupby(enrolment_keys, as_index=False)['sum_click'].sum()
  df_student_full = pd.merge(df_student_full, clicks_per_enrolment, on=enrolment_keys, how='left')
  # Enrolments with no recorded interaction have zero clicks.
  df_student_full['sum_click'] = df_student_full['sum_click'].fillna(0)

  return df_student_full, df_assessments_full

def dropout_analysis():
  """Train and evaluate a random-forest classifier that predicts student dropout.

  A row is labelled as dropout (1) when ``date_unregistration`` is present
  and 0 otherwise. Accuracy, the classification report and a confusion-matrix
  heatmap are written to the Streamlit page. Nothing is returned.
  """
  df_student_full, df_assessments_full = merge_data_analysis()

  # The label must be derived BEFORE missing values are filled: once NaN is
  # replaced by 0, no date_unregistration is missing any more and every row
  # would be labelled as dropout.
  df_student_full['dropout'] = df_student_full['date_unregistration'].notna().astype(int)

  # Total assessment score per student, attached through id_student. Assigning
  # the grouped Series directly would align on the row index of the unrelated
  # assessment table.
  score_per_student = df_assessments_full.groupby('id_student')['score'].sum()
  df_student_full['total_assessment_score'] = df_student_full['id_student'].map(score_per_student)

  # Remaining gaps are filled with 0. This leaves nothing to impute, so no
  # separate imputation step is needed.
  df_student_full = df_student_full.fillna(0)

  # Categorical columns are cast to str and label-encoded.
  categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'final_result']
  for column in categorical_columns:
      df_student_full[column] = LabelEncoder().fit_transform(df_student_full[column].astype(str))

  # Part 2: Predicting Student Dropout
  dropout_features = [
      'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
      'studied_credits', 'sum_click', 'total_assessment_score'
  ]

  X_dropout = df_student_full[dropout_features]
  y_dropout = df_student_full['dropout']

  # A classifier cannot be evaluated meaningfully on a single-class target.
  if y_dropout.nunique() < 2:
      st.warning('Dropout target contains a single class; skipping model training.')
      return

  # Train-test split
  X_train, X_test, y_train, y_test = train_test_split(X_dropout, y_dropout, test_size=0.3, random_state=42)

  # Standardize features; statistics come from the training part only so the
  # test set does not leak into preprocessing.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  # Train a RandomForestClassifier
  model = RandomForestClassifier(n_estimators=100, random_state=42)
  model.fit(X_train, y_train)

  # Predict
  y_pred = model.predict(X_test)

  # Evaluate
  st.write(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
  # st.text keeps the fixed-width layout of the report; st.write would parse it as markdown.
  st.text(classification_report(y_test, y_pred))

  conf_matrix = confusion_matrix(y_test, y_pred)
  fig, ax = plt.subplots()
  sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
  ax.set_title('Confusion Matrix for Dropout Prediction')
  ax.set_xlabel('Predicted')
  ax.set_ylabel('Actual')
  st.pyplot(fig)
  plt.close(fig)  # avoid accumulating figures across Streamlit reruns


# -------------------------------------Main Code--------------------------------------------
# -------------------------------------Main Code--------------------------------------------
# -------------------------------------Main Code--------------------------------------------
# -------------------------------------Main Code--------------------------------------------


# Page title and subtitle
st.title('Open University Learning Analytics Dataset (OULAD) Analysis')
st.markdown("### Dataset Visualization and Analysis")

# Names of the pages offered in the sidebar, in display order
sections = [
    "Introduction",
    "Data Overview",
    "Missing Value Analysis",
    "Exploratory Data Analysis (EDA)",
    "Machine Learning (ML)",
]

# Reference date used when day offsets are converted to timestamps
start_date = pd.Timestamp('2000-01-01')

# Sidebar navigation: the selected page name drives the if/elif chain below
st.sidebar.header("Contents")
choice = st.sidebar.radio("Select a section:", sections)

# Introduction page: a static description of the dataset
if choice == "Introduction":
    intro_text = """
    This dataset belongs to Open University Online Learning Platform (Also called as "Virtual Learning Environment(VLE)") that off-campus students use for accessing the course content, forum discussions, sending assessments and checking out assignment marks etc. It consists of 7 selected courses (mentioned as modules in the dataset). Different presentations indicated with letters "B" and "J" after year for semester 2 and semester 1 respectively.

    Additionally, the dataset includes student demographics such as location, age group, disability, education level, gender etc.
    Student assessment marks, interactions with the Virtual Learning Environment (VLE) are also included.

    It contains data about courses, students and their interactions with Virtual Learning Environment (VLE) for seven selected courses (called modules).
    Presentations of courses start in February and October - they are marked by “B” and “J” respectively. The dataset consists of tables connected using unique identifiers.
    """

    st.header("Introduction to Dataset")
    st.markdown(intro_text)


#--------------------------------------------------------------------------------------------------------------------------------
elif choice == "Data Overview":
    st.header("Data Overview")

    datasets = {
        "VLE": vle,
        "Student Interactions": vle_stu,
        "Student Registration": reg_stu,
        "Student Info": info_stu,
        "Student Assessments": as_stu,
        "Courses": course,
        "Assessments": assesment
    }

    dataset_choice = st.selectbox("Select Dataset for Overview:", list(datasets.keys()))

    #if dataset_choice:
    if dataset_choice == 'VLE':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
         1. **id_site**: Identification number of the material.
         2. **code_module**: Identification code for the module.
         3. **code_presentation**: Identification code of the presentation.
         4. **activity_type**: Role associated with the module material.
         5. **week_from**: Week from which the material is planned to be used.
         6. **week_to**: Week until which the material is planned to be used.
         """)
        st.write(datasets[dataset_choice][:1000])
    elif dataset_choice == 'Student Interactions':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
        1. **code_module**: Identification code for a module.
        2. **code_presentation**: Identification code of the module presentation.
        3. **id_student**: Unique identification number for the student.
        4. **id_site**: Identification number for the VLE material.
        5. **date**: Date of student’s interaction with the material, measured as the number of days since the start of the module-presentation.
        6. **sum_click**: Number of times a student interacts with the material.
        """)
        st.write(datasets[dataset_choice][:1000])

    elif dataset_choice == 'Student Registration':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
        1. **code_module**: Identification code for a module.
        2. **code_presentation**: Identification code of the presentation.
        3. **id_student**: Unique identification number for the student.
        4. **date_registration**: Date of student’s registration on the module presentation, measured as the number of days relative to the start of the module-presentation. Negative values indicate registration before the start.
        5. **date_unregistration**: Date of student’s un-registration from the module presentation, measured as the number of days relative to the start of the module-presentation.
        """)
        st.write(datasets[dataset_choice][:1000])
    elif dataset_choice == 'Student Info':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
      1. **code_module**: Identification code for the module.
      2. **code_presentation**: Identification code of the presentation.
      3. **id_student**: Unique identification number for the student.
      4. **gender**: Student’s gender.
      5. **region**: Geographic region where the student lived during the module presentation.
      6. **highest_education**: Highest education level of the student upon entry to the module presentation.
      7. **imd_band**: Index of Multiple Deprivation band of the place where the student lived during the module presentation.
      8. **age_band**: Age band of the student.
      9. **num_of_prev_attempts**: Number of times the student has attempted this module.
      10. **studied_credits**: Total number of credits for the modules the student is currently studying.
      11. **disability**: Indicates whether the student has declared a disability.
      12. **final_result**: Student’s final result in the module presentation.
      """)
        st.write(datasets[dataset_choice][:1000])
    elif dataset_choice == 'Student Assessments':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
        1. **id_assessment**: Identification number of the assessment.
        2. **id_student**: Unique identification number for the student.
        3. **date_submitted**: Date of student submission, measured as the number of days since the start of the module presentation.
        4. **is_banked**: Status flag indicating whether the assessment result has been transferred from a previous presentation.
        5. **score**: Student’s score in this assessment. The range is from 0 to 100. Scores lower than 40 are interpreted as Fail.
        """)

        st.write(datasets[dataset_choice][:1000])
    elif dataset_choice == 'Courses':
        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
        1. **code_module**: Code name of the module, which serves as the identifier.
        2. **code_presentation**: Code name of the presentation. It consists of the year and "B" for February start or "J" for October start.
        3. **length**: Length of the module-presentation in days.
        """)
        st.write(datasets[dataset_choice][:1000])
    elif dataset_choice == 'Assessments':

        st.subheader(f"Overview for {dataset_choice}")
        # Column descriptions
        st.markdown("""
        1. **code_module**: Identification code of the module to which the assessment belongs.
        2. **code_presentation**: Identification code of the presentation to which the assessment belongs.
        3. **id_assessment**: Identification number of the assessment.
        4. **assessment_type**: Type of assessment. Three types exist: Tutor Marked Assessment (TMA), Computer Marked Assessment (CMA), and Final Exam (Exam).
        5. **date**: Final submission date of the assessment, measured as the number of days since the start of the module-presentation.
        6. **weight**: Weight of the assessment in %. Typically, Exams are treated separately and have the weight 100%; the sum of all other assessments is 100%. If the information about the final exam date is missing, it is at the end of the last presentation week.
        """)
        st.write(datasets[dataset_choice][:1000])

#--------------------------------------------------------------------------------------------------------------------------------

elif choice == "Missing Value Analysis":
    st.header("Missing Value Treatment")

    datasets = {
        "VLE": vle,
        "Student Interactions": vle_stu,
        "Student Registration": reg_stu,
        "Student Info": info_stu,
        "Student Assessments": as_stu,
        "Courses": course,
        "Assessments": assesment
    }

    dataset_choice = st.selectbox("Select Dataset for Missing Value Analysis:", list(datasets.keys()))

    if dataset_choice == 'VLE':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.subheader(f"Missing Value Treatment for {dataset_choice}")
      st.write("Since the 'Week From' and 'Week to' has around 82% missing values, it is best to drop these columns.")
      st.write ("vle.drop(columns=['week_from','week_to'],inplace=True)")

    elif dataset_choice == 'Student Interactions':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.write('No Missing Values')
      st.subheader('Transforming date to datetime')
      st.write("""
      - Define the start date
      start_date = pd.Timestamp('2000-01-01')

      - Convert 'date' to datetime
      vle_stu['date'] = vle_stu['date'].apply(lambda x: start_date + pd.Timedelta(days=x-1))
      """)

    elif dataset_choice == 'Student Registration':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.subheader(f"Missing Value Treatment for {dataset_choice}")
      st.write("Since the 'date_unregistration' has around 65% missing values, it is best to drop these columns.")

      st.subheader('Transforming date_registration to datetime')
      st.write("""
      Define the start date
      start_date = pd.Timestamp('2000-01-01')

      Convert 'date_submitted' to datetime
      reg_stu['date_registration'] = reg_stu['date_registration'].apply(lambda x: start_date + pd.Timedelta(days=x-1))
      """)

    elif dataset_choice == 'Student Info':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.subheader(f"Missing Value Treatment for {dataset_choice}")
      st.write("Since 'imd_band' has missing value it's better to impute it with dummy value")
      st.write("info_stu['imd_band'].fillna('unknown', inplace=True)  # Filling missing values with 'unknown'")

    elif dataset_choice == 'Student Assessments':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.subheader(f"Missing Value Treatment for {dataset_choice}")
      st.write("Score has missing values. Imputing the missing value with the mean of the score achieved that particular student")
      st.write("""
      student_means = as_stu.groupby('id_student')['score'].transform('mean')
      as_stu['score'].fillna(student_means, inplace=True)
      """)

      st.subheader('Transforming date_submitted to datetime')
      st.write("""
      Define the start date
      start_date = pd.Timestamp('2000-01-01')

      Convert 'date_submitted' to datetime
      as_stu['date_submitted'] = as_stu['date_submitted'].apply(lambda x: start_date + pd.Timedelta(days=x-1))
      """)

    elif dataset_choice == 'Courses':
      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.write('No Missing Values')

    elif dataset_choice == 'Assessments':

      st.subheader(f"Missing Value Analysis for {dataset_choice}")
      missingValueAssessment(datasets[dataset_choice])

      st.write("'Date' has around 5% missing values so it's better that we drop these missing values as we don't if this exam has been conducted or not")




#--------------------------------------------------------------------------------------------------------------------------------------

elif choice == "Exploratory Data Analysis (EDA)":

  start_date = pd.Timestamp('2000-01-01')

  datasets = [
        "VLE",
        "Student Interactions",
        "Student Registration",
        "Student Info",
        "Student Assessments"
  ]

  dataset_choice = st.selectbox("Select Dataset for EDA:", datasets)

  if dataset_choice == 'VLE':
    st.subheader(f"EDA for {dataset_choice}")

    vle_cleaned = vle.copy()
    vle_cleaned.drop(columns=['week_from','week_to'],inplace=True)

    plotActivityCounts(vle_cleaned)

  elif dataset_choice == 'Student Interactions':

    vle_stu_cleaned = vle_stu.copy()
    # Convert 'date' to datetime
    vle_stu_cleaned['date'] = vle_stu_cleaned['date'].apply(lambda x: start_date + pd.Timedelta(days=x-1))


    student_vle_eda(vle_stu_cleaned)

  elif dataset_choice == 'Student Registration':

    reg_stu_cleaned = reg_stu.copy()
    reg_stu_cleaned.drop(columns=['date_unregistration'],inplace=True)
    reg_stu_cleaned['date_registration'] = pd.to_numeric(reg_stu_cleaned['date_registration'])
    reg_stu_cleaned = reg_stu_cleaned.dropna()
    reg_stu_cleaned['date_registration'] = reg_stu_cleaned['date_registration'].apply(lambda x: start_date + pd.Timedelta(days=x-1))

    stu_reg_eda(reg_stu_cleaned)

  elif dataset_choice == 'Student Info':
    info_stu_cleaned = info_stu.copy()
    info_stu_cleaned['imd_band'] = info_stu_cleaned['imd_band'].fillna('unknown', inplace=True)
    info_stu_cleaned['num_of_prev_attempts'] = info_stu_cleaned['num_of_prev_attempts'].fillna(info_stu['num_of_prev_attempts'].mean(), inplace=True)

    info_stu_eda(info_stu_cleaned)

  elif dataset_choice == 'Student Assessments':

    as_stu_cleaned = as_stu.copy()
    student_means = as_stu_cleaned.groupby('id_student')['score'].transform('mean')
    as_stu_cleaned['score'] = as_stu_cleaned['score'].fillna(student_means, inplace=True)
    as_stu_cleaned['date_submitted'] = as_stu_cleaned['date_submitted'].apply(lambda x: start_date + pd.Timedelta(days=x-1))

    as_stu_eda(as_stu_cleaned)

elif choice == "Machine Learning (ML)":

  student_info_assessments = data_merge(as_stu, info_stu)


  # Combine the assessments information
  student_info_assessments = data_merge(as_stu, info_stu)

  models = ['Introduction','Linear Regression', 'Decision Tree Regressor', 'ARIMA', 'Dropout_analysis']

  model_choice = st.selectbox("Select Model for Prediction:", models)



  if model_choice == 'Introduction':
    st.subheader("Data Merging and Splitting for Modeling")

    st.write('Data Selected: Student Assessment & Student Information')
    st.write('data_merge(as_stu, info_stu)')
    st.write("Final Dataset Size:: 207319 rows × 15 columns")
    st.write ("Target Column: score")
    st.write("Date column:: date_submitted")

    st.write(student_info_assessments.head(150))

    st.write("Spliting the data into 80% training set and 20% test set")
    st.write("X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)")

  elif(model_choice == 'Linear Regression'):
    linear_reg(student_info_assessments)
  elif model_choice == 'Decision Tree Regressor':
    decision_tree(student_info_assessments)
  elif model_choice == 'ARIMA':
    arima(student_info_assessments)
  elif model_choice == 'Dropout_analysis':
    dropout_analysis()










Overwriting app.py


In [44]:
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
updated 1 package and audited 36 packages in 0.819s

3 packages are looking for funding
  run `npm fund` for details

found 2 [93mmoderate[0m severity vulnerabilities
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

In [57]:
!streamlit run /content/app.py --server.port 8055 &>/content/logs.txt & curl ipv4.icanhazip.com

34.139.254.253


In [58]:
!npx localtunnel --port 8055


[K[?25hnpx: installed 22 in 2.318s
your url is: https://red-lamps-type.loca.lt
^C
