### ***Required Packages***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,r2_score,adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns

In [None]:
# Mount Google Drive so the dataset paths under /content/drive resolve (Colab only).
from google.colab import drive

DRIVE_MOUNT_POINT='/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

### ***Question_1***

In [None]:
# Question 1 input: the heart-disease CSV stored on the mounted Drive.
heart_csv='/content/drive/MyDrive/sem 6/Lab/ML Lab/SOC_LAB1/dataset/KNN/heart_disease_uci.csv'
df=pd.read_csv(heart_csv)

# Preview the first rows to confirm the load worked.
preview=df.head()
display(preview)

**Handle Missing values**

In [None]:
# Columns fed to the model and the label column (consumed by the next cell).
features=['id','age','sex','dataset','cp','trestbps','chol','fbs','restecg','thalch','exang','oldpeak','slope','ca','thal']
target=['num']

# Continuous measurements are imputed with the column mean.
numeric_cols=['trestbps','chol','thalch','oldpeak','ca']
# Categorical / boolean columns are imputed with the most frequent value.
categorical_cols=['sex','dataset','cp','fbs','restecg','exang','slope','thal']

# Impute BEFORE encoding. Encoding first turns a missing value into a class of
# its own (or raises on mixed str/NaN input, depending on the sklearn version),
# after which fillna() has nothing left to fill and a "mean" of label codes
# would be meaningless anyway.
for col in numeric_cols:
    df[col]=df[col].fillna(df[col].mean())

for col in categorical_cols:
    most_common=df[col].mode().iloc[0]   # mode() ignores NaN by default
    df[col]=df[col].fillna(most_common)

# Encode every categorical column as integer codes. Casting to str first gives
# the encoder a homogeneous input even for boolean-valued columns.
le=LabelEncoder()
for col in categorical_cols:
    df[col]=le.fit_transform(df[col].astype(str))

# Sanity check: the listed columns should now report zero missing values.
print(df.isnull().sum())

**Data Split, Scaling, KNN Model and Metrics**

In [None]:
# 'id' is a row identifier, not a clinical measurement; with a distance-based
# model it would contribute as much as any real feature after scaling, so it is
# excluded from the feature matrix.
model_features=[col for col in features if col!='id']
x=df[model_features]

# sklearn expects a 1-D label vector; df[target] is a one-column DataFrame, so
# squeeze it to a Series to avoid the column-vector conversion warning.
y=df[target].squeeze(axis=1)

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# Compare two scaling strategies; each scaler is fitted on the training split
# only and then applied to the test split.
scalers={'MinMax':MinMaxScaler(),'Standard':StandardScaler()}

for name,scaler in scalers.items():
    print(f'Applying {name} Scaling : ')
    scaled_x_train=scaler.fit_transform(X_train)
    scaled_x_test=scaler.transform(X_test)
    # Evaluate several neighbourhood sizes under the current scaling.
    for k in [3,5,7,9]:
        knn=KNeighborsClassifier(n_neighbors=k)
        knn.fit(scaled_x_train,y_train)
        y_pred=knn.predict(scaled_x_test)
        print(f"K : {k}")
        print(f'Accuracy Score : {accuracy_score(y_test,y_pred)}')
        print(f'Confusion Matrix : \n{confusion_matrix(y_test,y_pred)}')
        print(f'Classification Report : \n{classification_report(y_test,y_pred)}')

### ***Question_2***

In [None]:
# Question 2 input: the Telco customer-churn workbook on the mounted Drive.
telco_xlsx='/content/drive/MyDrive/sem 6/Lab/ML Lab/SOC_LAB1/dataset/KNN/Telco_customer_churn.xlsx'
df=pd.read_excel(telco_xlsx)

# Preview the first rows.
preview=df.head()
display(preview)

**Handle Missing Values**

In [None]:
# Remove the free-text churn reason and the customer identifier.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: the
# original inplace drop raised KeyError on a second execution because the
# columns were already gone.
df=df.drop(columns=['Churn Reason','CustomerID'],errors='ignore')

# Report how many missing values remain in each column.
print(df.isnull().sum())

**Encode Data**

In [None]:
# Replace every object-typed column with integer label codes.
le=LabelEncoder()
object_columns=df.select_dtypes(include=['object']).columns

for column_name in object_columns:
    # Cast to str first so the encoder always sees a homogeneous column.
    as_text=df[column_name].astype('str')
    df[column_name]=as_text
    df[column_name]=le.fit_transform(df[column_name])

display(df.head())

**Data Split, Model Training and Metrics**

In [None]:
# Columns that restate or are derived from the target must not be used as
# features, otherwise both models score near-perfectly for the wrong reason.
# NOTE(review): this assumes the standard IBM Telco schema, where 'Churn Label'
# is the Yes/No form of 'Churn Value' and 'Churn Score' is a churn-propensity
# score — confirm against the workbook. errors='ignore' makes the drop a no-op
# when those columns are absent.
leak_columns=['Churn Label','Churn Score']
churn_features=df.drop(columns=['Churn Value']+leak_columns,errors='ignore')
churn_target=df['Churn Value']

X_train,X_test,y_train,y_test=train_test_split(churn_features,churn_target,test_size=0.2,random_state=42)

# Standardise using statistics from the training split only.
scalers=StandardScaler()

X_train_Scaled=scalers.fit_transform(X_train)
X_test_Scaled=scalers.transform(X_test)

# Model 1: k-nearest neighbours with k=5.
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_Scaled,y_train)
y_pred_knn=knn.predict(X_test_Scaled)

# Model 2: logistic regression on the same scaled features.
logreg=LogisticRegression()
logreg.fit(X_train_Scaled,y_train)
y_pred_logreg=logreg.predict(X_test_Scaled)

# Side-by-side comparison on the held-out split.
print(f'KNN Accuracy : {accuracy_score(y_test,y_pred_knn)}')
print(f'Logistic Regression Accuracy : {accuracy_score(y_test,y_pred_logreg)}')

print(f"KNN f1 Score : {f1_score(y_test,y_pred_knn)}")
print(f"Logistic Regression f1 Score : {f1_score(y_test,y_pred_logreg)}")

print(f"KNN Classification Report : \n{classification_report(y_test,y_pred_knn)}")
print(f"Logistic Regression Classification Report : \n{classification_report(y_test,y_pred_logreg)}")

### ***Question 3***

In [None]:
# Question 3 input: the five-year stock price CSV on the mounted Drive.
stocks_csv='/content/drive/MyDrive/sem 6/Lab/ML Lab/SOC_LAB1/dataset/PCA/all_stocks_5yr.csv'
df=pd.read_csv(stocks_csv)

# Preview the first rows.
preview=df.head()
display(preview)

In [None]:
# Numeric columns that go into PCA.
features = ['open', 'high', 'low', 'close', 'volume']

# Keep only complete rows; the original row index is preserved on `data`.
data = df[features].dropna()

# Bring every column to zero mean and unit variance before PCA.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)


In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the total variance.
variance_to_keep = 0.90
pca = PCA(n_components=variance_to_keep)
data_pca = pca.fit_transform(data_scaled)

# How many components the 90% threshold actually required.
num_components = pca.n_components_
print(f'Number of components to retain 90% variance: {num_components}')

In [None]:
# Cumulative share of variance explained as components are added.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, num_components + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_numbers, cumulative_variance, marker='o', linestyle='--')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance by Components')
plt.show()

In [None]:
# Pairwise scatter plots of the standardised features (KDE on the diagonal).
scaled_frame = pd.DataFrame(data_scaled, columns=features)
sns.pairplot(scaled_frame, diag_kind='kde')
plt.suptitle('Stock Data Before PCA')
plt.show()


In [None]:
# Project the data onto the first two principal components.
first_pc = data_pca[:, 0]
second_pc = data_pca[:, 1]

fig, ax = plt.subplots()
ax.scatter(first_pc, second_pc, alpha=0.5)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('Stock Data After PCA')
plt.show()

In [None]:
# Binary target: 1 if the NEXT row's close is higher than this row's, else 0.
# The shift is done per ticker so the last row of one stock is never compared
# with the first row of the next one.
# NOTE(review): assumes rows are ordered by date within each ticker and that
# the ticker column is called 'Name' — confirm against the CSV. Without that
# column the code falls back to a plain shift over the whole frame.
if 'Name' in df.columns:
    next_close = df.groupby('Name')['close'].shift(-1)
else:
    next_close = df['close'].shift(-1)
df['price_movement'] = np.where(next_close > df['close'], 1, 0)

# Align the labels with the PCA rows explicitly. `data_pca` was built from
# `data` (the rows that survived dropna on the feature columns), so `data.index`
# identifies exactly which rows of `df` it corresponds to, in the same order.
# Rows with no next close have no defined label. np.where would silently mark
# them 0 (NaN > x is False) and a later dropna() cannot detect that, so they
# are filtered out here instead.
has_next_close = next_close.loc[data.index].notna().to_numpy()
X_pca = data_pca[has_next_close]
y = df.loc[data.index, 'price_movement'].to_numpy()[has_next_close]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Apply KNN classification on the principal components
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate KNN classification performance
accuracy = accuracy_score(y_test, y_pred)
print(f'KNN Classification Accuracy: {accuracy}')
print('Classification Report:\n', classification_report(y_test, y_pred))

### ***Question 4***

In [None]:
# Question 4 input: reload the heart-disease CSV from the mounted Drive.
heart_csv='/content/drive/MyDrive/sem 6/Lab/ML Lab/SOC_LAB1/dataset/KNN/heart_disease_uci.csv'
df=pd.read_csv(heart_csv)

# Preview the rows, then list column dtypes and non-null counts.
display(df.head())
df.info()

In [None]:
# Discard the two columns that are not used for clustering.
unused_columns=['id','ca']
df=df.drop(columns=unused_columns)
display(df.head())

# Label-encode each object column with its own freshly fitted encoder.
object_columns=df.select_dtypes(include=['object']).columns
for column_name in object_columns:
  df[column_name]=df[column_name].astype('str')
  encoder=LabelEncoder()
  df[column_name]=encoder.fit_transform(df[column_name])

display(df.head())

In [None]:
# Per-column count of missing values.
missing_per_column=df.isna().sum()
print(missing_per_column)

In [None]:
# Fill gaps in these four columns with each column's mean.
mean_imputed_columns=['trestbps','chol','thalch','oldpeak']
for column_name in mean_imputed_columns:
    column_mean=df[column_name].mean()
    df[column_name]=df[column_name].fillna(column_mean)

**Without PCA**

In [None]:
# Cluster on every column except the label 'num'.
unlabelled=df.drop(columns=['num'])

kmeans=KMeans(n_clusters=5,random_state=42,n_init=10)
y_pred_kmeans=kmeans.fit_predict(unlabelled)

# Agreement between the cluster assignment and the 'num' column.
ari=adjusted_rand_score(labels_true=df['num'],labels_pred=y_pred_kmeans)
print(ari)

**With PCA**

In [None]:
# Reduce the non-label columns to two principal components.
unlabelled=df.drop(columns=['num'])
pca=PCA(n_components=2)
pca_x=pca.fit_transform(unlabelled)

# Refit the KMeans estimator created in the previous cell on the 2-D projection.
y_pred_kmeans=kmeans.fit_predict(pca_x)

# Agreement between the new cluster assignment and the 'num' column.
ari=adjusted_rand_score(labels_true=df['num'],labels_pred=y_pred_kmeans)
print(ari)

In [None]:
# Scatter of the two principal components, coloured by K-Means cluster id.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(pca_x[:, 0], pca_x[:, 1], c=y_pred_kmeans, cmap='viridis', alpha=0.6)
# The colours encode the K-Means assignment (y_pred_kmeans), so the colorbar is
# labelled accordingly; the previous 'Digit Label' text did not describe them.
plt.colorbar(scatter, label='Cluster Label')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K Means Clustering')
plt.show()

In [None]:
# Mean squared error between the original features and their reconstruction
# from the two retained principal components.
X=df.drop(columns=['num'])
X_reconstructed = pca.inverse_transform(pca_x)

# Work on plain NumPy arrays so np.mean reduces over ALL elements to a scalar.
# Applied to a DataFrame, np.mean returns a per-column Series on older pandas
# versions, and the ':.4f' format below then raises TypeError.
squared_diff = np.square(X.to_numpy(dtype=float) - np.asarray(X_reconstructed))
reconstruction_error = float(np.mean(squared_diff))
print(f"Reconstruction error: {reconstruction_error:.4f}")

### ***Question 5***

In [None]:
# Question 5 input: the housing CSV on the mounted Drive.
housing_csv='/content/drive/MyDrive/sem 6/Lab/ML Lab/SOC_LAB1/dataset/Regression/Housing.csv'
df=pd.read_csv(housing_csv)

# Convert every object column to integer label codes.
le=LabelEncoder()
object_columns=df.select_dtypes(include=['object']).columns
for column_name in object_columns:
  encoded=le.fit_transform(df[column_name])
  df[column_name]=encoded

display(df.head())

In [None]:
# Price against area, one point per row.
fig, ax = plt.subplots()
ax.scatter(df['area'], df['price'])
ax.set_xlabel('area')
ax.set_ylabel('price')
plt.show()

In [None]:
# Full predictor list; defined here and read by the following cells.
features=[
    "area",
    "bedrooms",
    "bathrooms",
    "stories",
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "parking",
    "prefarea",
    "furnishingstatus",
]

# Simple linear regression: price explained by area alone.
x=df[['area']]
y=df['price']

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# fit() returns the estimator itself, so construction and fitting can be chained.
lg=LinearRegression().fit(X_train,y_train)
y_pred=lg.predict(X_test)

area_r2=r2_score(y_test,y_pred)
print(f'r2_Score : {area_r2}')

In [None]:
# Multiple linear regression: predict price from all listed features.
# The predictors and the response were previously swapped (the 12 features were
# being predicted from price), which is why a reshape workaround was needed.
# With the roles corrected, `x` is already a 2-D frame and no reshape is required.
x=df[features]
y=df['price']

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

lg=LinearRegression()
lg.fit(X_train,y_train)

y_pred=lg.predict(X_test)

# R^2 of predicted vs. actual price on the held-out split.
print(f'r2_Score : {r2_score(y_test,y_pred)}')

In [None]:
# Predictors whose multicollinearity is being assessed.
features=[
    "area", "bedrooms", "bathrooms", "stories", "mainroad", "guestroom",
    "basement", "hotwaterheating", "airconditioning", "parking",
    "prefarea", "furnishingstatus"
]
X=df[features]

# variance_inflation_factor regresses one column of the matrix it is given on
# all the others and does NOT add an intercept itself. Without a constant
# column the auxiliary regressions are forced through the origin, which yields
# uncentered (typically inflated) VIFs. Prepend a column of ones and skip it
# when reporting.
design_matrix = np.column_stack([np.ones(len(X)), X.to_numpy(dtype=float)])

vif_data = pd.DataFrame()
vif_data["Feature"] = features
# Column 0 is the intercept, so feature i sits at column i + 1.
vif_data["VIF"] = [variance_inflation_factor(design_matrix, i + 1) for i in range(len(features))]
print("\nVariance Inflation Factor (VIF):")
print(vif_data)