# Investment Prediction

Analisis ini merupakan pemenuhan tugas survey aps lanjutan setelah survey. Pada kasus ini akan dilakukan prediksi apakah mahasiswa akan melakukan investasi di masa mendatang, dilihat dari jawaban hasil survey sebelumnya.

## 1.1) Importing Libraries

In [None]:
# Ignore warnings :
import warnings
warnings.filterwarnings('ignore')


# Handle table-like data and matrices :
import numpy as np
import pandas as pd
import math 



# Modelling Algorithms :

# Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis


#Modelling
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold, learning_curve, cross_val_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
!pip install catboost
from catboost import CatBoostClassifier
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV

#preprocessing :
from sklearn.preprocessing import MinMaxScaler , StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Modelling Helpers :
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score



#preprocessing :
from sklearn.preprocessing import MinMaxScaler , StandardScaler, Imputer, LabelEncoder



#evaluation metrics :

# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 

# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  



# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno



# Configure visualisations
%matplotlib inline
# Global plot styling. The calls below are order-dependent: each one writes
# into matplotlib's rcParams, so a later call overrides any overlapping
# setting made by an earlier one.
mpl.style.use( 'ggplot' )
plt.style.use('fivethirtyeight')
sns.set(context="notebook", palette="dark", style = 'whitegrid' , color_codes=True)
# Final explicit overrides applied on top of the style sheets above.
params = { 
    'axes.labelsize': "large",
    'xtick.labelsize': 'x-large',
    'legend.fontsize': 20,
    'figure.dpi': 150,
    'figure.figsize': [25, 7]
}
plt.rcParams.update(params)

## 1.2) Extract Dataset
* Specify the location to the Dataset and Import them.

In [None]:
# Load the survey responses from the Excel workbook into a DataFrame.
df = pd.read_excel('../input/surveyaps/Data Survey Klasifikasi.xlsx')
# Keep an untouched copy of the raw data before any columns are dropped or encoded.
# NOTE(review): `diamonds` is not referenced again in the visible notebook; the
# name looks like a leftover from another notebook -- confirm before renaming.
diamonds = df.copy()

In [None]:
# Preview the first rows of the raw survey data.
df.head()

## 1.3) Drop columns with many missing values, and the `id` column since we already have an index.

In [None]:
# Remove the identifier column and the sparsely answered questions in place,
# then preview the remaining columns.
df.drop(columns=['id', 'P18', 'P37', 'P38', 'P40'], inplace=True)
df.head()

In [None]:
# (rows, columns) of the DataFrame after the column drop above.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

## 1.4) Examine NaN Values

In [None]:
# Count missing values per column to confirm that none remain in the
# columns that were kept.
df.isnull().sum()

In [None]:
# Visual cross-check of the null counts: draw the missingno matrix for df.
msno.matrix(df) # just to visualize. no missing values.

### Great, So there are no NaN values.

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

<a id="there_you_go_3"></a>
# 2. Visualization Of All Features

## 2.1) Label dan Jenis Kelamin

In [None]:
# Side-by-side count plots of the target label (Y) and gender (P1).
plt.rcParams['figure.figsize'] = (18, 7)

# Left panel: how many respondents fall into each class of the target Y.
# Columns are passed by keyword (data=/x=): handing the vector to countplot
# positionally is deprecated in seaborn 0.11 and no longer means "x" in 0.12+.
plt.subplot(1, 2, 1)
sns.countplot(data=df, x='Y', palette='muted')
plt.title('Invest atau tidak', fontsize=30)
plt.xlabel('Invest', fontsize=15)
plt.ylabel('count', fontsize=15)

# Right panel: distribution of respondents by gender (P1).
plt.subplot(1, 2, 2)
sns.countplot(data=df, x='P1', palette='dark')
plt.title('Jenis Kelamin', fontsize=30)
plt.xlabel('Jenis Kelamin', fontsize=15)
plt.ylabel('count', fontsize=15)

## 2.2) Usia

In [None]:
# Count of respondents per age value (P2).
# The column is passed by keyword; the positional form is deprecated in
# seaborn 0.11 and is interpreted as `data` rather than `x` from 0.12 on.
sns.countplot(data=df, x='P2')
plt.title('Usia', fontsize=15)
plt.xlabel('Usia', fontsize=15)
plt.show()

## 2.3) Domisili

In [None]:
# Count of respondents per domicile (P3), bars ordered from most to least frequent.
# The column is passed by keyword; the positional form is deprecated in
# seaborn 0.11 and is interpreted as `data` rather than `x` from 0.12 on.
sns.countplot(data=df, x='P3', order=df['P3'].value_counts().index)
plt.title('Domisili', fontsize=15)
plt.xlabel('Domisili', fontsize=15)
plt.show()

## 2.4) Rumpun Keilmuan

In [None]:
# Horizontal count plot of the field of study (P5), most frequent category first.
p5_by_frequency = df['P5'].value_counts().index
sns.countplot(data=df, y='P5', order=p5_by_frequency)
plt.title('Rumpun Keilmuan', fontsize=15)
plt.ylabel('Rumpun Keilmuan', fontsize=15)
plt.show()

## 2.5) Semester

In [None]:
# Count of respondents per semester (P6).
# The column is passed by keyword; the positional form is deprecated in
# seaborn 0.11 and is interpreted as `data` rather than `x` from 0.12 on.
sns.countplot(data=df, x='P6')
plt.title('Semester', fontsize=15)
plt.xlabel('Semester', fontsize=15)
plt.show()

## 2.6) Residence

In [None]:
# Count of respondents per residence category (P7).
# The column is passed by keyword; the positional form is deprecated in
# seaborn 0.11 and is interpreted as `data` rather than `x` from 0.12 on.
sns.countplot(data=df, x='P7')
plt.title('Residence', fontsize=15)
plt.xlabel('Residence', fontsize=15)
plt.show()

<a id="there_you_go_5"></a>
# 3) Feature Encoding

* **Label the Categorical Features with digits to Distinguish.**
* **As we can't feed String data for Modelling.**

In [None]:
# Integer-encode every column except the ones listed in `angka`, which are
# left as they are. The target column Y is not in `angka`, so it is encoded too.
label_encode = LabelEncoder()
angka = ['P2', 'P6', 'P16', 'P39']
# One encoder object is refit for each column, so after the loop it only
# holds the mapping of the last encoded column.
for column in [name for name in df.columns if name not in angka]:
    df[column] = label_encode.fit_transform(df[column])
print(df.head())

<a id="there_you_go_2"></a>
# 4) Correlation Between Features

In [None]:
# Correlation Map
corr = df.corr()
sns.heatmap(data=corr, square=True , annot=True, cbar=True)

<a id="there_you_go_6"></a>
# 5) Feature Scaling

* **Divide the dataset into train and test sets, so that we can fit the modelling algorithms on the train set and predict on the test set.**
* **Then apply feature scaling. Although it is not strictly necessary in this case, it surely helps.**

In [None]:
# Split the data into train and test.

In [None]:
# Separate the target column from the features, then hold out 20% of the
# rows as a test set (fixed seed so the split is reproducible).
y = df['Y']
X = df.drop(columns=['Y'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=66
)

In [None]:
# Applying Feature Scaling ( StandardScaler )
# You can also Apply MinMaxScaler.

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks in.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

<a id="there_you_go_7"></a>
# 6) Modelling Algos

 ## 6.1) Logistic Regression

In [None]:
# Logistic-regression baseline: fit on the scaled training split, evaluate
# on the held-out test split, and report 5-fold cross-validation on the
# training split.
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)
accuracies = cross_val_score(estimator=clf_lr, X=X_train, y=y_train, cv=5, verbose=1)
y_pred = clf_lr.predict(X_test)
# Probability of the second class for each test row (not used further in this cell).
y_prob = clf_lr.predict_proba(X_test)[:, 1]
# NOTE(review): R2 is a regression metric; on predicted class labels it is
# hard to interpret. Kept because the notebook's summary quotes it.
r2 = r2_score(y_test, y_pred)
print('')
print('####### Logistic Regression #######')

# evaluating the model
print("Training Accuracy : %.4f" % clf_lr.score(X_train, y_train))
print("Testing Accuracy : %.4f" % clf_lr.score(X_test, y_test))
print('F1-Score : %.4f' % metrics.f1_score(y_test, y_pred, average='macro'))
print('Cross Validation : %.4f' % accuracies.mean())
print('R2     : %0.2f ' % r2)

# confusion matrix
# fmt='d' prints the integer counts as-is; the default '.2g' format switches
# to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (6, 6)
sns.heatmap(cm, annot=True, fmt='d')

# classification report
cr = classification_report(y_test, y_pred)
print(cr)

## 6.2) XGBClassifier

In [None]:
# XGBoost classifier: fit on the scaled training split, evaluate on the
# held-out test split, and report 5-fold cross-validation on the training split.
clf_rr = XGBClassifier()
clf_rr.fit(X_train, y_train)
accuracies = cross_val_score(estimator=clf_rr, X=X_train, y=y_train, cv=5, verbose=1)
y_pred = clf_rr.predict(X_test)
# Probability of the second class for each test row (not used further in this cell).
y_prob = clf_rr.predict_proba(X_test)[:, 1]
# NOTE(review): R2 is a regression metric; on predicted class labels it is
# hard to interpret. Kept because the notebook's summary quotes it.
r2 = r2_score(y_test, y_pred)

print('')
print('####### XGBClassifier #######')

# evaluating the model
print("Training Accuracy : %.4f" % clf_rr.score(X_train, y_train))
print("Testing Accuracy : %.4f" % clf_rr.score(X_test, y_test))
print('F1-Score : %.4f' % metrics.f1_score(y_test, y_pred, average='macro'))
print('Cross Validation : %.4f' % accuracies.mean())
print('R2     : %0.2f ' % r2)

# confusion matrix
# fmt='d' prints the integer counts as-is; the default '.2g' format switches
# to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (6, 6)
sns.heatmap(cm, annot=True, fmt='d')

# classification report
cr = classification_report(y_test, y_pred)
print(cr)

# plot feature importance
# Create the large figure first and draw into it. Calling plt.figure() after
# plot_importance would only open an empty extra figure and leave the
# importance plot at the default size.
fig, ax = plt.subplots(figsize=(16, 12))
xgb.plot_importance(clf_rr, ax=ax)
plt.show()

## 6.3) RandomForest Classifier

In [None]:
# Random-forest classifier: fit on the scaled training split, evaluate on the
# held-out test split, and report 5-fold cross-validation on the training split.
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
accuracies = cross_val_score(estimator=clf_rf, X=X_train, y=y_train, cv=5, verbose=1)
y_pred = clf_rf.predict(X_test)
# Probability of the second class for each test row (not used further in this cell).
y_prob = clf_rf.predict_proba(X_test)[:, 1]
# NOTE(review): R2 is a regression metric; on predicted class labels it is
# hard to interpret. Kept because the notebook's summary quotes it.
r2 = r2_score(y_test, y_pred)

print('')
print('####### Random Forest #######')

# evaluating the model
print("Training Accuracy : %.4f" % clf_rf.score(X_train, y_train))
print("Testing Accuracy : %.4f" % clf_rf.score(X_test, y_test))
print('F1-Score : %.4f' % metrics.f1_score(y_test, y_pred, average='macro'))
print('Cross Validation : %.4f' % accuracies.mean())
print('R2     : %0.2f ' % r2)

# confusion matrix
# fmt='d' prints the integer counts as-is; the default '.2g' format switches
# to scientific notation for counts of 100 or more.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (6, 6)
sns.heatmap(cm, annot=True, fmt='d')

# classification report
cr = classification_report(y_test, y_pred)
print(cr)

# plot feature importance
# xgb.plot_importance only accepts a Booster, an XGBModel or an importance
# dict, so it raises ValueError for a scikit-learn forest. Plot the forest's
# own feature_importances_ instead. X_train is a plain array after scaling,
# so the feature names are taken from X's columns (same column order).
rf_importances = pd.Series(clf_rf.feature_importances_, index=X.columns).sort_values()
fig, ax = plt.subplots(figsize=(16, 12))
rf_importances.plot.barh(ax=ax)
ax.set_title('Feature importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

### XGBClassifier gives us the highest R2-Score [ 96% ] and F1-Score [99%].

# Thank You :)