In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the startup dataset CSV, relative to the notebook's working directory.
DATA_PATH = "../input/startup-success-prediction/startup data.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows of the raw frame.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Columns removed before modelling. Judging by their names these are index
# artefacts, identifiers, location details and raw date columns.
columns_to_drop = [
    "Unnamed: 0", "Unnamed: 6", "id", "state_code.1",
    "latitude", "longitude", "zip_code", "city", "name",
    "closed_at", "founded_at", "first_funding_at", "last_funding_at",
    "object_id",
]
# Mutates df in place; re-running this cell raises KeyError because the
# columns are already gone.
df.drop(columns=columns_to_drop, inplace=True)

df.head(3)

In [None]:
# One-hot encode the `status` column. drop_first=True omits the first category,
# and the later cells use the remaining `status_closed` indicator as the target.
status_encoded = pd.get_dummies(data=df, columns=["status"], drop_first=True)
df = status_encoded
df.head(2)

In [None]:
# Count missing values per column and show the seven columns with the most.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(7)

# Only two columns have missing values. Let's fill them in.

In [None]:
# Impute the missing milestone ages with each column's own median.
# The result is assigned back to df rather than calling fillna(inplace=True) on
# an attribute-accessed column: that form is chained assignment, which under
# pandas copy-on-write updates a temporary object and leaves df unchanged.
# Warnings are suppressed in this notebook, so that failure would be silent.
milestone_cols = ["age_first_milestone_year", "age_last_milestone_year"]
df[milestone_cols] = df[milestone_cols].fillna(df[milestone_cols].median())

# Let's First Apply PCA

In [None]:
# Define X and Y variable 

X=df.drop(["state_code","category_code","status_closed"], axis=1)
Y=df.status_closed

In [None]:
X=StandardScaler().fit_transform(X)

pca=PCA(n_components=32)

X_pca=pca.fit_transform(X)
exp_var=pca.explained_variance_ratio_
cumsum_var=np.cumsum(exp_var)
cumsum_var
plt.plot(cumsum_var)
plt.grid()

In [None]:
# Re-fit PCA on the standardised matrix, keeping only the first 17 components,
# and wrap the projected data in a DataFrame for the modelling cells.
pca_new = PCA(n_components=17)
X_new = pd.DataFrame(pca_new.fit_transform(X))

# Running total of the variance share explained by the retained components.
exp_var_new = pca_new.explained_variance_ratio_
cumsum_var_new = exp_var_new.cumsum()

plt.plot(cumsum_var_new)
plt.grid()
X_new.head(3)

# I keep the first 17 principal components of this dataset (each component is a linear combination of the original features, not a subset of them)

# Logistic Regression with PCA

In [None]:
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix


# Hold out 30% of the PCA-transformed rows for testing. The same split is
# reused by the other models in the following cells.
X_train, X_test, Y_train, Y_test = train_test_split(X_new, Y, test_size=0.30, random_state=42)


# saga is a stochastic solver, so random_state is fixed to make the fit
# repeatable. max_iter is raised from the default of 100 because saga can stop
# before converging, and this notebook suppresses the resulting warning.
log_reg=LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5,
                           max_iter=5000, random_state=42)
log_reg.fit(X_train,Y_train)

y_test_pred=log_reg.predict(X_test)
y_train_pred=log_reg.predict(X_train)

# Compare train and test accuracy to spot over-fitting.
print("Accuracy of the test set:", accuracy_score(Y_test,y_test_pred))
print("Accuracy of the train set:", accuracy_score(Y_train,y_train_pred))

print(classification_report(Y_test,y_test_pred))
print(confusion_matrix(Y_test,y_test_pred))

# Random Forest with PCA

In [None]:
# Random forest on the PCA components. random_state is fixed because the
# bootstrap samples and feature sub-sampling are random; without a seed the
# reported accuracies and feature importances change on every run.
forest=RandomForestClassifier(n_estimators=20,
                             max_depth=6,
                             criterion="gini",
                             random_state=42,
                             )

forest.fit(X_train,Y_train)
y_test_pred_forest=forest.predict(X_test)
y_train_pred_forest=forest.predict(X_train)

print("Classification Report:","\n",classification_report(Y_test,y_test_pred_forest),"\n")
print("Confusion Matrix","\n",confusion_matrix(Y_test,y_test_pred_forest),"\n")
print("Accuracy score of random forest test set:",accuracy_score(Y_test,y_test_pred_forest))
print("Accuracy score of random forest train set:",accuracy_score(Y_train,y_train_pred_forest))

In [None]:
# Importance the fitted forest assigns to each column of X_new, sorted from
# largest to smallest and drawn as horizontal bars.
plt.figure(figsize=(8,6), dpi=100)
fea_imp = (
    pd.Series(forest.feature_importances_, index=X_new.columns)
    .sort_values(ascending=False)
)
fea_imp.plot.barh();

In [None]:
# GridSearchCV

# Hyper-parameter grid: 5 tree counts x 6 depths x 2 split criteria.
param_forest={"n_estimators":np.arange(5,30,5),
             "max_depth":np.arange(1,7,1),
              "criterion":["gini","entropy"]
             }

# The search uses a fresh, seeded estimator instead of the notebook variable
# `forest`. Every parameter set on that variable is overridden by the grid
# anyway; the seed makes the cross-validation scores reproducible, and the cell
# no longer depends on what `forest` currently holds.
grid_forest=GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=param_forest,
                        cv=10,
                         n_jobs=-1,
                        return_train_score=True
                        )

grid_forest.fit(X_train,Y_train)

# Best parameter combination and its mean cross-validated score.
print(grid_forest.best_params_)
print(grid_forest.best_score_)

In [None]:
# Top five grid-search candidates ranked by mean cross-validated test score.
shown_columns = ["param_criterion", "param_max_depth",
                 "param_n_estimators", "mean_test_score"]
results_forest = pd.DataFrame(grid_forest.cv_results_)
results_forest[shown_columns].sort_values("mean_test_score", ascending=False).head()

# KNN with PCA

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the PCA components.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)

y_test_pred_knn = knn.predict(X_test)
y_train_pred_knn = knn.predict(X_train)

# Compute the metrics first, then report them.
knn_test_accuracy = accuracy_score(Y_test, y_test_pred_knn)
knn_train_accuracy = accuracy_score(Y_train, y_train_pred_knn)
knn_confusion = confusion_matrix(Y_test, y_test_pred_knn)
knn_report = classification_report(Y_test, y_test_pred_knn)

print("Accuracy of test set with the knn :", knn_test_accuracy)
print("Accuracy of train set with knn :", knn_train_accuracy, "\n")
print("Confusion matrix:", "\n", knn_confusion, "\n")
print("Classification report: ", "\n", knn_report)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Sparse graph linking every training row to its 2 nearest neighbours, drawn
# as a sparsity pattern (one marker per non-zero entry).
A = knn.kneighbors_graph(X=X_train, n_neighbors=2)
plt.spy(A, marker="*")
plt.show()

# SVC with PCA

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings on the PCA components.
svc=SVC()

svc.fit(X_train,Y_train)
y_test_pred_svc=svc.predict(X_test)
y_train_pred_svc=svc.predict(X_train)

# The first line reports the held-out accuracy; it was previously mislabelled
# as "train set", which made the two figures indistinguishable in the output.
print("Accuracy of test set with SVC:", accuracy_score(Y_test,y_test_pred_svc))
print("Accuracy of train set with SVC:", accuracy_score(Y_train,y_train_pred_svc))
print("Classification report:", "\n", classification_report(Y_test,y_test_pred_svc))
print("Confusion matrix:", confusion_matrix(Y_test,y_test_pred_svc))

# XGBOOST with PCA

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (5 boosting rounds) with a binary logistic objective.
xgboost = XGBClassifier(n_estimators=5, objective='binary:logistic')
xgboost.fit(X_train, Y_train)

y_test_pred_xgboost = xgboost.predict(X_test)
y_train_pred_xgboost = xgboost.predict(X_train)

# Compute the metrics first, then report them.
xgb_test_accuracy = accuracy_score(Y_test, y_test_pred_xgboost)
xgb_train_accuracy = accuracy_score(Y_train, y_train_pred_xgboost)
xgb_confusion = confusion_matrix(Y_test, y_test_pred_xgboost)
xgb_report = classification_report(Y_test, y_test_pred_xgboost)

print("Accuracy of test set with XGBOOST:", xgb_test_accuracy)
print("Accuracy of train set with XGBOOST:", xgb_train_accuracy, "\n")
print("Confusion matrix:", "\n", xgb_confusion)
print("Classification report:", "\n", xgb_report)

### As a result, with only 17 principal components most of the algorithms reach very high prediction accuracy

In [None]:
# Test-set accuracy of every model, keyed by the label shown on the chart.
# The scores live in their own Series rather than being assigned to `forest`,
# `knn`, `svc` and `xgboost`: rebinding those names to floats would discard the
# fitted estimators and break any re-run of the cells that use them.
test_accuracies = pd.Series({
    "Log": accuracy_score(Y_test, y_test_pred),
    "Random Forest": accuracy_score(Y_test, y_test_pred_forest),
    "KNN": accuracy_score(Y_test, y_test_pred_knn),
    "SVC": accuracy_score(Y_test, y_test_pred_svc),
    "XGBOOST": accuracy_score(Y_test, y_test_pred_xgboost),
})

sns.set(style = "darkgrid" , font_scale = 1.2)
# One plt.bar call per model so each bar takes the next colour of the cycle.
for model_name, accuracy in test_accuracies.items():
    plt.bar(model_name, height=accuracy)
plt.ylabel("Test accuracy")
plt.xticks(rotation=45);

# The algorithm with the highest test accuracy is SVC, followed by XGBoost