In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# CSV path, relative to the notebook's working directory.
DATA_PATH = '../input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()


In [None]:
# List the column labels. The feature names used later in this notebook
# start with a space (e.g. ' Quick Ratio'), so copy them exactly.
df.columns

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

# Feature Selection 

Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.

The example below uses the chi-squared (chi²) statistical test for non-negative features to select 10 of the best features from the Company Bankruptcy Prediction dataset.

In [None]:
# Positional split: column 0 is the target, every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]
print(y.shape)

# Score each feature against the target with the chi-squared test.
# chi2 only accepts non-negative feature values.
bf = SelectKBest(k=10, score_func=chi2)
fit = bf.fit(X, y)

# Pair every feature name with its score in a single two-column table.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(['Specs', 'Score'], axis=1)

# Show the ten highest-scoring features.
print(featureScores.nlargest(10, 'Score'))

In [None]:
# Target column first, followed by ten hand-listed feature columns.
# The leading space in each feature name is part of the column label.
selected_columns = [
    'Bankrupt?',
    ' Fixed Assets to Assets',
    ' Cash/Current Liability',
    ' Net Value Growth Rate',
    ' Fixed Assets Turnover Frequency',
    ' Revenue per person',
    ' Total assets to GNP price',
    ' Quick Ratio',
    ' Quick Asset Turnover Rate',
    ' Total Asset Growth Rate',
    ' Research and development expense rate',
]
u = df[selected_columns]
dff = pd.DataFrame(u)
dff

In [None]:
#Analysis

# Target Variable 

In [None]:
# Bar chart of how many rows fall into each value of the target column.
sns.countplot(x='Bankrupt?', data=dff)

In [None]:
# NOTE: the names are inverted relative to the usual convention --
# `x` holds the target (first column) as an (n, 1) column vector and
# `y` holds the feature matrix (all remaining columns). The resampling
# cell relies on this order when it calls fit_resample(y, x).
x = dff.iloc[:, [0]].to_numpy()
y = dff.iloc[:, 1:].to_numpy()

The imbalanced classification problem is what we face when there is a severe skew in the class distribution of our training data. The skew need not be extreme (it can vary), but we treat imbalanced classification as a problem because it can influence the performance of our machine learning algorithms.

One way the imbalance may affect a machine learning algorithm is that the algorithm completely ignores the minority class. This is an issue because the minority class is often the class we are most interested in. For instance, when building a classifier to separate fraudulent from non-fraudulent transactions, the data is likely to contain far more non-fraudulent transactions than fraudulent ones — think about it: it would be very worrying if fraudulent transactions were as common as legitimate ones.

In [None]:
# Target values: the first column of the selected frame, kept as a Series.
data_x = dff[dff.columns[0]]

In [None]:
from collections import Counter

# Tally how many rows carry each target value to expose the class imbalance.
class_counts = Counter(data_x)
print(class_counts)

# Random OverSampling

Random oversampling involves selecting random examples from the minority class with replacement and supplementing the training data with copies of these instances; hence a single instance may be selected multiple times.

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Seed the sampler so the resampled data set is identical on every run
# (the train/test split in this notebook is seeded with the same value).
over_sample = RandomOverSampler(random_state=76)

# In this notebook `y` is the feature matrix and `x` is the target stored as an
# (n, 1) column; fit_resample expects a 1-D target, so flatten it explicitly.
X_ros, y_ros = over_sample.fit_resample(y, x.ravel())

# Class counts after resampling.
print(Counter(y_ros))


In [None]:
# Shapes of the resampled features and target, one per line.
print(X_ros.shape, y_ros.shape, sep='\n')

In [None]:
# Shape of the resampled target when viewed as a single column.
print(np.reshape(y_ros, (-1, 1)).shape)

In [None]:
# One histogram per column of the selected frame on a 20x20-inch figure.
dff.hist(edgecolor='white', figsize=(20, 20))
plt.show()

In [None]:
# Split the ORIGINAL data first and oversample only the training portion.
# Splitting after oversampling puts exact copies of the same minority rows in
# both train and test, which leaks information and inflates the test score
# (especially for a nearest-neighbour model).
# Reminder: in this notebook `y` is the feature matrix and `x` the (n, 1) target.
target = x.ravel()
X_train, X_test, y_train, y_test = train_test_split(
    y, target, stratify=target, random_state=76
)

# Balance the classes in the training set only; the test set keeps the
# original class distribution.
X_train, y_train = RandomOverSampler(random_state=76).fit_resample(X_train, y_train)

In [None]:
# Shapes of the training features and training target, one per line.
print(X_train.shape, y_train.shape, sep='\n')

# Training KNN Model 

In [None]:
from sklearn.model_selection import cross_val_score

# Choose k using 5-fold cross-validation on the training data only. Picking the
# k that maximises the test-set score would make the reported test score
# optimistically biased, because the test set would have been used for tuning.
# NOTE: if X_train contains oversampled duplicates, the cross-validation scores
# are themselves optimistic; they are used here only to rank the candidates.
cv_scores = {
    k: cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5
    ).mean()
    for k in range(1, 10)
}

# max() returns the first maximum, so ties resolve to the smallest k.
best_n = max(cv_scores, key=cv_scores.get)
best_cv = cv_scores[best_n]

# Refit on the full training set with the chosen k and score the test set once.
knn = KNeighborsClassifier(n_neighbors=best_n)
knn.fit(X_train, y_train)
best_training = knn.score(X_train, y_train)
best_test = knn.score(X_test, y_test)

print("best number of neighbors: {}".format(best_n))
print("best cross-validation score: {:.3f}".format(best_cv))
print("best training set score : {:.3f}".format(best_training))
print("best test set score: {:.3f}".format(best_test))

I hope you found this notebook easy to understand.
Please don't forget to upvote — it boosts my confidence and motivates me to do more work on the Kaggle platform.
Feel free to share any suggestions.


HAVE A NICE DAY ! 
