In [2]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=b6414b19926dc1daf83dedc60ac79f7e4a3b2c0608b6983d4268a597595b215e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.express as px
import plotly.subplots as sp
from plotly.graph_objs import Scatter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
df=pd.read_csv('/content/loan_approval_dataset.csv')

In [5]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [6]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [8]:
df.isnull().sum()

Unnamed: 0,0
loan_id,0
no_of_dependents,0
education,0
self_employed,0
income_annum,0
loan_amount,0
loan_term,0
cibil_score,0
residential_assets_value,0
commercial_assets_value,0


In [9]:
df.duplicated().sum()

0

In [10]:
# Ensure column names are correct (remove any extra spaces)
df.columns = df.columns.str.strip()

# Check column names again after stripping spaces
print("Columns after stripping spaces:", df.columns)

# Define the numeric columns
numeric_columns = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
                   'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']

# Check if all numeric columns exist in DataFrame
missing_columns = [col for col in numeric_columns if col not in df.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")

# Convert to numeric, coercing errors to NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values in numeric columns
df.dropna(subset=numeric_columns, inplace=True)

# Calculate Z-scores
df_z_scores = df[numeric_columns].apply(zscore)

# Identify outliers based on Z-score
outliers_z_score = (df_z_scores.abs() > 3).any(axis=1)
outliers_df_z_score = df[outliers_z_score]
print("Rows with outliers based on Z-score:")
print(outliers_df_z_score)

# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers based on IQR
outliers_iqr = ((df[numeric_columns] < lower_bound) | (df[numeric_columns] > upper_bound)).any(axis=1)
outliers_df_iqr = df[outliers_iqr]
print("Rows with outliers based on IQR:")
print(outliers_df_iqr)

Columns after stripping spaces: Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')
Rows with outliers based on Z-score:
      loan_id  no_of_dependents      education self_employed  income_annum  \
157       158                 5   Not Graduate           Yes       9900000   
258       259                 0       Graduate            No       9800000   
367       368                 1   Not Graduate           Yes       9400000   
554       555                 3   Not Graduate           Yes       9500000   
714       715                 4   Not Graduate            No       9900000   
892       893                 4       Graduate            No       9300000   
895       896                 0       Graduate            No       9400000   
905       906          

#**Box Plot of Numerical Features by Loan Status**

In [11]:
#Box Plot for 'income_annum' by 'loan_status'
fig = px.box(df, x='loan_status', y='income_annum',
             labels={'loan_status': 'Loan Status', 'income_annum': 'Income Annually'},
             title='Income Annually by Loan Status')
fig.show()

# Box Plot for 'loan_amount' by 'loan_status'
fig = px.box(df, x='loan_status', y='loan_amount',
             labels={'loan_status': 'Loan Status', 'loan_amount': 'Loan Amount'},
             title='Loan Amount by Loan Status')
fig.show()

#**Bar Plot of Loan Status Counts**

In [12]:
# Bar Plot of Loan Status Counts
fig = px.bar(df, x=df['loan_status'].value_counts().index, y=df['loan_status'].value_counts().values,
             labels={'x': 'Loan Status', 'y': 'Count'},
             title='Count of Loan Status')
fig.show()

#**Scatter Plot of Two Numerical Features by Loan Status**

In [13]:
fig = px.scatter(df, x='income_annum', y='loan_amount', color='loan_status',
                 labels={'income_annum': 'Income Annually', 'loan_amount': 'Loan Amount'},
                 title='Income Annually vs Loan Amount by Loan Status')
fig.show()

#**Pair Plot**

In [14]:
fig = sp.make_subplots(rows=3, cols=3, subplot_titles=[
    'Income vs Loan Amount', 'Income vs Loan Term', 'Income vs Cibil Score',
    'Loan Amount vs Loan Term', 'Loan Amount vs Cibil Score', 'Loan Term vs Cibil Score',
    'Income vs Residential Assets', 'Loan Amount vs Residential Assets', 'Loan Term vs Residential Assets'
])

# Add scatter plots to subplots
fig.add_trace(Scatter(x=df['income_annum'], y=df['loan_amount'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Income vs Loan Amount'), row=1, col=1)

fig.add_trace(Scatter(x=df['income_annum'], y=df['loan_term'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Income vs Loan Term'), row=1, col=2)

fig.add_trace(Scatter(x=df['income_annum'], y=df['cibil_score'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Income vs Cibil Score'), row=1, col=3)

fig.add_trace(Scatter(x=df['loan_amount'], y=df['loan_term'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Loan Amount vs Loan Term'), row=2, col=1)

fig.add_trace(Scatter(x=df['loan_amount'], y=df['cibil_score'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Loan Amount vs Cibil Score'), row=2, col=2)

fig.add_trace(Scatter(x=df['loan_term'], y=df['cibil_score'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Loan Term vs Cibil Score'), row=2, col=3)

fig.add_trace(Scatter(x=df['income_annum'], y=df['residential_assets_value'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Income vs Residential Assets'), row=3, col=1)

fig.add_trace(Scatter(x=df['loan_amount'], y=df['residential_assets_value'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Loan Amount vs Residential Assets'), row=3, col=2)

fig.add_trace(Scatter(x=df['loan_term'], y=df['residential_assets_value'], mode='markers',
                      marker=dict(color=df['loan_status'].astype('category').cat.codes),
                      name='Loan Term vs Residential Assets'), row=3, col=3)

# Update layout
fig.update_layout(title_text='Pair Plots of Numerical Features by Loan Status',
                  showlegend=True)

fig.show()

#**Checking unique values in the category columns**

In [15]:
a=df['education'].unique()
b=df['self_employed'].unique()
c=df['loan_status'].unique()
print(a)
print(b)
print(c)

[' Graduate' ' Not Graduate']
[' No' ' Yes']
[' Approved' ' Rejected']


#**Applying One Hot Encoding on the specific columns**

In [16]:
label_encoder = LabelEncoder()
df['education'] = label_encoder.fit_transform(df['education'])
df['self_employed'] = label_encoder.fit_transform(df['self_employed'])
df['loan_status'] = label_encoder.fit_transform(df['loan_status'])

df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


In [17]:
X=df.drop('loan_status',axis=1)
y=df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.appName("ClassificationExample").getOrCreate()

# Assuming y_train and y_test are Pandas Series or DataFrames
# Convert y_train and y_test to DataFrame if they're Series
if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame(name="label")
if isinstance(y_test, pd.Series):
    y_test = y_test.to_frame(name="label")

# Combine X_train with y_train
train_data = pd.concat([X_train, y_train], axis=1)

# Combine X_test with y_test
test_data = pd.concat([X_test, y_test], axis=1)

# Convert Pandas DataFrame to Spark DataFrame
spark_train_data = spark.createDataFrame(train_data)
spark_test_data = spark.createDataFrame(test_data)

# Convert feature columns to a list of strings
input_columns = list(X_train.columns)

# Assemble features into a single column
vector_assembler = VectorAssembler(inputCols=input_columns, outputCol="features")
spark_train_data = vector_assembler.transform(spark_train_data)
spark_test_data = vector_assembler.transform(spark_test_data)

# Initialize models
log_reg = LogisticRegression(labelCol="label", featuresCol="features")
random_forest = RandomForestClassifier(labelCol="label", featuresCol="features")
svm = LinearSVC(labelCol="label", featuresCol="features")

# Train models
log_reg_model = log_reg.fit(spark_train_data)
random_forest_model = random_forest.fit(spark_train_data)
svm_model = svm.fit(spark_train_data)

# Make predictions
predictions_log_reg = log_reg_model.transform(spark_test_data)
predictions_random_forest = random_forest_model.transform(spark_test_data)
predictions_svm = svm_model.transform(spark_test_data)

# Evaluate models
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

print("Logistic Regression")
print("Accuracy:", evaluator.evaluate(predictions_log_reg))

print("\nRandom Forest")
print("Accuracy:", evaluator.evaluate(predictions_random_forest))

print("\nSupport Vector Machine")
print("Accuracy:", evaluator.evaluate(predictions_svm))


Logistic Regression
Accuracy: 0.9039812646370023

Random Forest
Accuracy: 0.9613583138173302

Support Vector Machine
Accuracy: 0.9180327868852459
