In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Entrepreneur Competency in University Students

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import warnings  
warnings.filterwarnings('ignore')

In [None]:
# Load the survey dataset from the read-only Kaggle input directory
# and preview its first rows.
entre = pd.read_csv('../input/entrepreneurial-competency-in-university-students/data.csv')
entre.head()

## Data Wrangling

In [None]:
# Feature overview: show every column name as a plain Python list
entre.columns.tolist()

## Features Explanation

### EducationSector
    Engineering background or not

### IndividualProject
    If the student builds personal project

### Age
    Age of student

### Gender
    Sex of student

### City
    If the student stays in a city

### Influenced
    If the student is influenced by someone

### Perseverance
    Rating of a student based upon perseverance

### DesireToTakeInitiative
    Rating of a student based upon desire to take initiative

### Competitiveness
    Competitive rating

### SelfReliance
    Self reliance rating

### StrongNeedToAchieve
    Strong need to achieve a goal rating

### SelfConfidence
    Self confidence rating

### GoodPhysicalHealth
    Good physical health rating

### MentalDisorder
    If there is any mental disorder

### KeyTraits
    Key traits of the student

### ReasonsForLack
    Reason for lack of entrepreneurship culture

### y
    Whether the student is likely to become an entrepreneur or not

## Check for data types

In [None]:
# Show the dtype pandas inferred for each column.
entre.dtypes
# Author's note: the inferred dtypes were judged correct, so no casting is applied.

### Check For NAN

In [None]:
# Count the missing values in every column
entre.isna().sum()

### NAN
    There are no NaN values except in the ReasonsForLack feature, which has 91 of them; this is not good because we only have 219 samples,
    so roughly 41% of the values in that column are missing.
    For the time being we simply remove this feature and see later what happens with our predictive model.

In [None]:
# Drop ReasonsForLack because a large share of its values is missing.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
entre = entre.drop(columns=['ReasonsForLack'], errors='ignore')
# Data wrangling is done at this point.

## Exploratory Data Analysis

In [None]:
# Split the columns into numerical and categorical groups before exploring them.

# Age is the only numerical feature.
numerical_features = ['Age']

# Every other column except the label 'y' is treated as categorical; this covers
# nominal features as well as ordinal discrete ratings (categorical, but with a
# clear order).
categorical_features = [col for col in entre.columns if col not in ('Age', 'y')]

# The label column.
target = entre['y']

## Univariate Analysis

### Descriptive Statistic
### Numerical

In [None]:
# Descriptive statistics for the numerical feature(s) listed in numerical_features.
entre[numerical_features].describe()

#### Age
    219 values
    Minimum age is 17
    Maximum age is 26; this range is typical for university students
    Mean age is 19.75
    Median is 20
    Since the median and mean are close, the Age feature might not have extreme outliers
    The median is slightly higher than the mean, which by the usual rule of thumb hints at a mild left skew; with integer-valued ages that rule is unreliable, so the histogram below is used to check the actual shape and tail of the distribution
    Standard deviation is 1.28; if Age follows a normal distribution, then 68% of ages fall within 18.47-21.03 (roughly 18-21 years old) and 95% fall within roughly 17-22 years old
    

## Age Distribution
### Histogram
### Boxplot

In [None]:
# Histogram of Age using one bin per distinct age value
n_age_bins = entre['Age'].nunique()
entre[numerical_features].hist(bins=n_age_bins)
plt.show()

In [None]:
# Boxplot of the numerical feature(s) to reveal potential outliers.
entre[numerical_features].boxplot()
plt.show()

### Distribution Analysis
    Our histogram shows that the Age feature is bell-shaped and right-skewed with a short tail.
    It turns out there are outliers in our dataset,
    probably because ages 17 and 22-26 occur so rarely compared with the other ages.
    Let's look at the bar chart.

In [None]:
# Bar chart of Age: one bar per distinct age, annotated with its frequency.
# Counting on the Series directly avoids relying on the column name produced by
# DataFrame.value_counts().to_frame(), which is 0 on pandas 1.x but 'count' on
# pandas >= 2.0 (where the former rename to 'Counts' silently did nothing and
# the lookup raised KeyError).
age_counts = entre['Age'].value_counts()
bars = plt.bar(x=age_counts.index, height=age_counts.values)
for bar in bars:
    count = bar.get_height()
    # plt.text arguments: x position, y position, label
    plt.text(bar.get_x() + 0.12, count + 0.7, count)
plt.show()

### Outliers
    I decide to drop outliers above age 22
 
### Results below

In [None]:
# Keep only students aged 22 or younger (older ages are treated as outliers).
# .copy() makes the result an independent DataFrame, so the column assignments
# performed in later cells do not write into a slice of the original frame.
# NOTE: `target` was created before this filter and still contains the removed rows.
entre = entre.loc[entre['Age'] <= 22].copy()
entre[numerical_features].hist(bins=entre['Age'].nunique())
plt.show()  # distribution after removing the outliers

## Categorical Data

### Countplot

In [None]:
# Categorical features: what is the dominant value of each one?
# The first categorical feature is skipped on purpose; it is drawn separately
# (as a horizontal chart) in the next cell.
grid_features = categorical_features[1:]
n_cols = 2
n_rows = -(-len(grid_features) // n_cols)  # ceiling division

fig = plt.figure(figsize=(15, 20))
for position, feature in enumerate(grid_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    # Pass the column as x=...: recent seaborn releases interpret the first
    # positional argument as `data`, not as the variable to count.
    sns.countplot(x=entre[feature])
    plt.title(feature, fontsize=20)
    plt.xlabel('')
# Adjust the layout once, after every subplot has been drawn.
plt.tight_layout()
plt.show()

In [None]:
# Horizontal count plot for the first categorical feature
first_feature = categorical_features[0]
plt.figure(figsize=(10, 7))
sns.countplot(y=entre[first_feature])
plt.show()

## Categorical Univariate Analysis
    1. Most respondents are from the engineering sciences major
    2. Even though the margin is small, more students have an individual project than not
    3. There are more male students than female; is it because the majority are engineering science students? This has to be checked later
    4. More students stay in a city than not
    5. Most students are influenced by someone 

## Below is categorical data from survey given to students
    1. More students answer 3 or 4 in:
    Perseverance, Desire to take initiative, Competitiveness, Self reliance, Self confidence, Good physical health
    2. Most students feel a strong need to achieve something
    3. The two strongest key traits among students are positivity and passion

## Target
### Balance Analysis


In [None]:
# Class balance of the target.
# The Series is passed as x=...: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
plt.figure(figsize=(7, 5))
sns.countplot(x=target)
plt.show()

In [None]:
# Donut chart of the target class distribution.

# Human-readable names for the encoded target values.
translate_output = {'output': {1: 'Become Entrepreneur', 0: 'Not Entrepreneur'}}

# Read class values and counts straight from value_counts(). Going through
# reset_index() is fragile: its column names differ between pandas 1.x
# ('index', 'y') and pandas >= 2.0 ('y', 'count').
class_counts = target.value_counts()
labels = [translate_output['output'][value] for value in class_counts.index]
sizes = class_counts.tolist()

colors = ['#ff9999', '#66b3ff']
explode = (0.05, 0.05)  # small gap between the wedges

fig, ax = plt.subplots(figsize=(8, 5))
ax.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90,
       pctdistance=0.85, explode=explode)

# A white circle in the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

# Equal aspect ratio ensures that the pie is drawn as a circle.
ax.axis('equal')
plt.tight_layout()
plt.show()

### Little bit unbalance data

### analysis
    58.4% become entrepreneurs vs 41.6% who do not
    That is a fact about the sample,
    but it also means
    the target (and therefore the dataset itself) is imbalanced.
    A 16.8 percentage-point difference is large; this will affect our model's accuracy

# Bivariate Analysis

## Numerical 

### Pairplot

In [None]:
# Pairplot of the numerical feature(s), coloured by the target.
# The columns are selected from `entre` itself rather than concatenated with
# `target`: `target` was created before the age-outlier filter, so an index-based
# concat would bring the removed rows back as NaN.
numeric_with_target = entre[numerical_features + ['y']]
sns.pairplot(numeric_with_target, hue='y')
plt.savefig('num_pairplot.png')
plt.show()

### Heatmap

In [None]:
# Correlation heatmap between the numerical feature(s) and the target.
# The columns are selected from `entre` itself so that features and label always
# come from the same (outlier-filtered) rows; `target` was created before that filter.
fig, ax = plt.subplots(figsize=(5, 3))
numeric_with_target = entre[numerical_features + ['y']]
sns.heatmap(numeric_with_target.corr(), annot=True, cmap="Oranges", ax=ax)
plt.savefig('num_heatmap.png')
plt.show()

## Categorical

### Pairplot

In [None]:
# Pairplot of the categorical features, coloured by the target.
# sns.pairplot builds its own figure, so no figure is created beforehand (the
# previous plt.figure call only produced an extra empty figure).
# The columns are selected from `entre` itself so that features and label come from
# the same (outlier-filtered) rows; `target` was created before that filter.
categorical_with_target = entre[categorical_features + ['y']]
sns.pairplot(categorical_with_target, hue='y')
plt.savefig('cat_pairplot.png')
plt.show()

## Building Machine Learning Model

### Perform Binary encoding to categorical features with binary traits

In [None]:
# Manual binary encoding (instead of sklearn's LabelEncoder): 'Yes' -> 1, anything else -> 0.
categorical_features_to_change = ['IndividualProject', 'City', 'Influenced', 'MentalDisorder']

# EducationSector is not encoded here; it is dropped in a later cell.
for column in categorical_features_to_change:
    # Compare the cell value itself with 'Yes'. The previous expression
    # `1 if 'Yes' else 0` tested the non-empty string literal, which is always
    # truthy, so every row was encoded as 1.
    entre[column] = (entre[column] == 'Yes').astype(int)

In [None]:
# Gender: 'Male' -> 0, anything else -> 1.
# The comparison must involve the cell value; the previous expression
# `0 if 'Male' else 1` tested the non-empty string literal, which is always
# truthy, so every row was encoded as 0.
entre['Gender'] = (entre['Gender'] != 'Male').astype(int)

# Manual label encoding of KeyTraits: each trait is mapped to an integer code.
entre = entre.replace({'KeyTraits': {'Passion': 0, 'Vision': 1, 'Resilience': 2, 'Positivity': 3, 'Work Ethic': 4}})

# Drop EducationSector (rebinding instead of mutating the frame in place).
entre = entre.drop(columns=['EducationSector'])

### Check Dtypes

In [None]:
# Re-check the dtypes after encoding; the author expects every column to be in order here.
entre.dtypes

## Mutual Information Score

In [None]:
# Separate the label from the feature matrix
target = entre['y']
entre = entre.drop(columns=['y'])


def mi_scores(dataset, target):
    """Return the mutual information between each column of `dataset` and `target`.

    The scores come from sklearn's mutual_info_classif (random_state=42) and are
    returned as a pandas Series indexed by column name, sorted from highest to
    lowest score.
    """
    scores = mutual_info_classif(dataset, target, random_state=42)
    labelled = pd.Series(scores, name="mutual information scores", index=dataset.columns)
    return labelled.sort_values(ascending=False)


mutual_info_score = mi_scores(entre, target)
mutual_info_score  # display the MI score of every feature

In [None]:
# Horizontal bar chart of the mutual information scores (largest bar on top).
plt.figure(dpi=100, figsize=(8, 5))
mutual_info_score = mutual_info_score.sort_values(ascending=True)
bar_positions = np.arange(len(mutual_info_score))
feature_names = list(mutual_info_score.index)
plt.barh(bar_positions, mutual_info_score, color='#ff9999')
plt.yticks(bar_positions, feature_names)
# The title is set BEFORE saving so that the exported PNG includes it.
plt.title("Mutual Information Scores")
plt.savefig('mutual_information_score.png')
plt.show()

### Mutual information score analysis
    Competitiveness has the highest mutual information score
    Self reliance, perseverance, influenced, age, and individual project have no impact
    so we're only going to use 8 features

In [None]:
# Keep every feature except the ones judged uninformative in the MI analysis.
# The column is named 'IndividualProject' (no space); the previous spelling
# 'Individual Project' never matched, so that column was silently kept.
low_information_features = ['SelfReliance', 'Perseverance', 'Influenced', 'Age', 'IndividualProject']
features = [col for col in entre.columns if col not in low_information_features]

entre = entre[features]

### Check before proceed further

In [None]:
# Preview the final feature matrix before modelling.
entre.head()
# Author's note: all remaining features are already binary- or label-encoded,
# so no further transformation or scaling is applied.

### Train Test Split
### I'm not using Cross validation yet

In [None]:
# Stratified 80/20 train/validation split.
# random_state is fixed so that the split (and every metric computed on it) is
# reproducible across runs; 42 matches the seed already used in mi_scores.
X_train, X_valid, y_train, y_valid = train_test_split(
    entre, target, stratify=target, train_size=0.80, random_state=42
)

## using keras deep learning model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers


def _hidden_block(units=256, dropout_rate=0.3):
    """Return one Dense(relu) -> BatchNormalization -> Dropout stack as a list of layers."""
    return [
        layers.Dense(units, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
    ]


# Binary classifier: input normalisation, two identical hidden blocks and a
# single sigmoid output unit.
n_features = entre.shape[1]
model = keras.Sequential([
    layers.BatchNormalization(input_shape=[n_features]),
    *_hidden_block(),
    *_hidden_block(),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
#optimizer
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
#early stopping to prevent overfitting
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
#train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[early_stopping],
)

## Loss and Accuracy Plot

In [None]:
# Collect the per-epoch training history into a DataFrame and plot the
# loss curves and the accuracy curves (training vs validation).
history_df = pd.DataFrame(history.history)

loss_columns = ['loss', 'val_loss']
accuracy_columns = ['binary_accuracy', 'val_binary_accuracy']

history_df[loss_columns].plot(title="Cross-entropy")
history_df[accuracy_columns].plot(title="Accuracy")
plt.show()

In [None]:
# Report the best (lowest) validation loss and the best (highest) validation accuracy
best_val_loss = history_df['val_loss'].min()
best_val_accuracy = history_df['val_binary_accuracy'].max()
summary_template = "Best Validation Loss: {:0.2f}" + "\nBest Validation accuracy: {:0.2f}"
print(summary_template.format(best_val_loss, best_val_accuracy))