In [23]:
# Imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score  
import joblib

In [24]:
# Rename function

def rename(df):
    df.rename(columns = {'1. What is your age?':'Age','2. Gender':'Sex','3. Relationship Status':'Relationship Status',
       '4. Occupation Status':'Occupation',
       '5. What type of organizations are you affiliated with?':'Affiliations',
       '6. Do you use social media?':'Social Media User?',
       '7. What social media platforms do you commonly use?':'Platforms Used',
       '8. What is the average time you spend on social media every day?':'Time Spent',
       '9. How often do you find yourself using Social media without a specific purpose?':'ADHD Q1',
       '10. How often do you get distracted by Social media when you are busy doing something?':'ADHD Q2',
       "11. Do you feel restless if you haven't used Social media in a while?":'Anxiety Q1',
       '12. On a scale of 1 to 5, how easily distracted are you?':'ADHD Q3',
       '13. On a scale of 1 to 5, how much are you bothered by worries?':'Anxiety Q2',
       '14. Do you find it difficult to concentrate on things?':'ADHD Q4',
       '15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?':'Self Esteem Q1',
       '16. Following the previous question, how do you feel about these comparisons, generally speaking?':'Self Esteem Q2',
       '17. How often do you look to seek validation from features of social media?':'Self Esteem Q3',
       '18. How often do you feel depressed or down?':'Depression Q1',
       '19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?':'Depression Q2',
       '20. On a scale of 1 to 5, how often do you face issues regarding sleep?':'Depression Q3' },inplace=True)
    return df


In [25]:
# Drop functions

def drop(df, col):
    df.drop(col, axis=1, inplace=True)

def dropc(df, condition_index):
    df.drop(condition_index, inplace=True)

def drops(df, cols):
    for col in cols:
        drop(df, col)
    

In [26]:
# Coversion functions

# Converting Male: 0 and Female: 1
def sexc(df):
    df.loc[df['Sex'] == 'Male', 'Sex'] = 0
    df.loc[df['Sex'] == 'Female', 'Sex'] = 1
    df['Sex'] = df['Sex'].astype('int64')
    return df
    
# Converting In a relationship: 0, Single: 1, Married: 2 and Divorced: 3
def relstc(df):
    df.loc[df['Relationship Status'] == 'In a relationship', 'Relationship Status'] = 0
    df.loc[df['Relationship Status'] == 'Single', 'Relationship Status'] = 1
    df.loc[df['Relationship Status'] == 'Married', 'Relationship Status'] = 2
    df.loc[df['Relationship Status'] == 'Divorced', 'Relationship Status'] = 3
    df['Relationship Status'] = df['Relationship Status'].astype('int64')
    return df

# Converting University Student: 0, School Student: 1, Salaried Worker: 2 and Retired: 3
def occupc(df):
    df.loc[df['Occupation'] == 'University Student', 'Occupation'] = 0
    df.loc[df['Occupation'] == 'School Student', 'Occupation'] = 1
    df.loc[df['Occupation'] == 'Salaried Worker', 'Occupation'] = 2
    df.loc[df['Occupation'] == 'Retired', 'Occupation'] = 3
    df['Occupation'] = df['Occupation'].astype('int64')
    return df

# Converting Time Spent Column into integer
def timec(df):
    df.loc[df['Time Spent'] == 'Less than an Hour', 'Time Spent'] = 0
    df.loc[df['Time Spent'] == 'Between 1 and 2 hours', 'Time Spent'] = 1
    df.loc[df['Time Spent'] == 'Between 2 and 3 hours', 'Time Spent'] = 2
    df.loc[df['Time Spent'] == 'Between 3 and 4 hours', 'Time Spent'] = 3
    df.loc[df['Time Spent'] == 'Between 4 and 5 hours', 'Time Spent'] = 4
    df.loc[df['Time Spent'] == 'More than 5 hours', 'Time Spent'] = 5
    df['Time Spent'] = df['Time Spent'].astype('int64')
    return df


In [27]:
# Reordering Columns function

def recols(df):
    titles = list(df.columns)
    titles[10], titles[11] = titles[11], titles[10]
    titles[11], titles[13] = titles[13], titles[11]
    titles[12], titles[13] = titles[13], titles[12]
    df = df[titles]
    return df

In [28]:
# Updating grading system function

def ugsys(df):
    titles = list(df.columns)
    questions = titles[8:]
    for q in questions:
        df.loc[df[q] == 3, q] = 0
        if q == 'Self Esteem Q2':
            df.loc[df[q] == 1, q] = 5
            df.loc[df[q] == 2, q] = 3
            df.loc[df[q] == 4, q] = -2
            df.loc[df[q] == 5, q] = -4
        else:
            df.loc[df[q] == 1, q] = -1
            df.loc[df[q] == 2, q] = 0
            df.loc[df[q] == 4, q] = 3
            
    return df
    

In [29]:
# Merging ADHD, Anxiety, Self Esteem and Depression Scores, Total

def merge(df):
    ADHD = ['ADHD Q1', 'ADHD Q2', 'ADHD Q3', 'ADHD Q4']
    df['ADHD Score'] = df[ADHD].sum(axis=1)
    Anxiety = ['Anxiety Q1', 'Anxiety Q2']
    df['Anxiety Score'] = df[Anxiety].sum(axis=1)
    SelfEsteem = ['Self Esteem Q1', 'Self Esteem Q2','Self Esteem Q3']
    df['Self Esteem Score'] = df[SelfEsteem].sum(axis=1)
    Depression = ['Depression Q1', 'Depression Q2','Depression Q3']
    df['Depression Score'] = df[Depression].sum(axis=1)
    Total = ['ADHD Score', 'Anxiety Score','Self Esteem Score','Depression Score']
    df['Total Score'] = df[Total].sum(axis=1)
    drop(df, df.iloc[:, 8:20])

    return df

In [30]:
# Outcome creation function

def ocf(df):
    def score(s):
      if s < 22.5:
        return "0"
      elif s >= 22.5:
        return "1"
    
    df['Outcome']= df['Total Score'].apply(lambda e: score(e))
    df['Outcome'] = df['Outcome'].astype('int64')

    return df

In [31]:
# Data refining and processing, updating grading system function (Initial)

def irefine(data):
    df = pd.read_csv(data)
    drop(df, 'Timestamp')
    ldropli = ['Age', 'Sex', 'Affiliations', 'Occupation', 'Relationship Status', 'Platforms Used', 'Social Media User?', 'Total Score'] # Columns to drop later
    df = rename(df)
    df['Age'] = df['Age'].astype('int64')
    dropc(df, df.loc[(df['Sex'] != 'Male') & (df['Sex'] != 'Female')].index)
    fns = [sexc, recols, timec, ugsys, merge, ocf, relstc, occupc]
    for e in fns:
        df = e(df)
    drops(df, ldropli)

    return df

In [32]:
# Model: Decided to go with Neural Nets for my front-end
# Other valid options maybe SVC or FNN

def model(df):

    # Split data into training data, and testing data
    X = df.drop(['Outcome'], axis = 1)
    y = df['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2 ,random_state=16)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5120, 16), random_state=16)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return clf, accuracy

In [33]:
# Run

data = "dataset.csv"
df = irefine(data)
tmodel, accuracy = model(df)
joblib.dump(tmodel, "model.pkl")
print(accuracy)

0.9894736842105263
