In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Part A

**Q1-Q5**

In [4]:
# Load the dataset: every column except the last is a feature, the last one is the target.
data = pd.read_csv("/content/data_for_large_scale.csv")

X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

X_array = np.array(X)
Y_array = np.array(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_array, Y_array, test_size = 0.3, random_state = 10)

# np.array_split(a, k) produces k near-equal pieces, so a piece can hold more
# than 90 rows whenever the row count is not a multiple of 90. max(1, ...) keeps
# the split valid when a set has fewer than 90 rows.
n_train_chunks = max(1, len(X_train)//90)
n_test_chunks = max(1, len(X_test)//90)

X_train_reshaped = np.array_split(X_train, n_train_chunks)
X_test_reshaped = np.array_split(X_test, n_test_chunks)

# Split the targets with the same section count so that every target piece
# has exactly as many rows as the feature piece it is paired with.
Y_train_chunks = np.array_split(Y_train, n_train_chunks)

# Kept for backward compatibility; not referenced by the other visible cells.
Y_train_reshaped = Y_train[:len(X_train_reshaped)*90]
Y_test_reshaped = Y_test[:len(X_test_reshaped)*90]

sgd = SGDRegressor(random_state = 10)

# Coefficient at index 4, recorded after each incremental update.
coefs = []

for X_chunk, Y_chunk in zip(X_train_reshaped, Y_train_chunks):
  sgd.partial_fit(X_chunk, Y_chunk)
  coefs.append(sgd.coef_[4])

In [53]:
# The bare expressions below are evaluated for inspection; a notebook cell
# only renders the value of its final expression.
coefs[4]

sgd.coef_

sgd.intercept_

# Predict one test piece at a time, then join the pieces into a single array.
chunk_predictions = [sgd.predict(test_chunk) for test_chunk in X_test_reshaped]
Y_pred = np.hstack(chunk_predictions)

# Error and goodness-of-fit on the held-out set.
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

r2

56.05563939842212

**Q6-Q8**

In [91]:
datalink = "https://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt"

# Column labels are supplied explicitly through `names`.
column_names = ['OSM_id', 'LONGITUDE', 'LATITUDE', 'ALTITUDE']

# With `chunksize` set, read_csv hands back a reader that yields the file
# piece by piece (up to 20000 rows each) instead of one DataFrame.
data = pd.read_csv(
    datalink,
    names = column_names,
    chunksize = 20000,
    iterator = True,
)

In [60]:
from sklearn.preprocessing import StandardScaler

In [89]:
# Fresh estimator instances for the chunk-by-chunk fitting cells that follow.
sgd = SGDRegressor(random_state=10)
scaler = StandardScaler()

In [90]:
# Open a dedicated chunked reader for this pass. A chunked reader can only be
# walked once, so iterating the shared `data` object here would leave it empty
# for any later cell that loops over it.
scaler_reader = pd.read_csv(datalink, names = ['OSM_id', 'LONGITUDE', 'LATITUDE', 'ALTITUDE'], chunksize = 20000, iterator = True)

# Accumulate the scaler's running statistics one chunk at a time.
for chunk in scaler_reader:
  scaler.partial_fit(chunk)

In [92]:
# Empty containers for the values recorded during incremental training.
intercepts, coefs = [], []

In [93]:
# Open a dedicated chunked reader for the training pass. A chunked reader can
# only be walked once; looping over an already-consumed `data` object would run
# zero iterations and leave the model unfitted.
training_reader = pd.read_csv(datalink, names = ['OSM_id', 'LONGITUDE', 'LATITUDE', 'ALTITUDE'], chunksize = 20000, iterator = True)

for chunk in training_reader:
  # NOTE(review): the whole chunk is scaled and used as the feature matrix, so
  # it still contains the ALTITUDE column that is also passed as the target.
  # Confirm that this is intended.
  chunk_scaled = scaler.transform(chunk)
  sgd.partial_fit(chunk_scaled, chunk['ALTITUDE'])
  # Record the intercept and the coefficient at index 1 after every chunk.
  intercepts.append(sgd.intercept_)
  coefs.append(sgd.coef_[1])

In [96]:
# Show the values collected in `coefs`, one entry per processed chunk.
coefs

[0.0007290956791143449,
 2.0884085001975475e-05,
 0.0001638935325536048,
 0.0004363320803436531,
 0.00012914319203397286,
 0.0004855514822103179,
 0.0005988484731357072,
 0.00041424921976848374,
 0.0005447755680819508,
 0.00038341522561841627,
 -7.319172599376467e-05,
 0.00041503543870966074,
 0.0004466188580924832,
 0.0004816139609003369,
 -0.00012742319209731762,
 -5.727518980908614e-05,
 0.00012701646956260517,
 0.0003818907591664188,
 0.00048794894428996093,
 0.00040233152878901466,
 7.323034390687191e-05,
 0.00037171267531807697]

In [72]:
# Count the rows by streaming the file through a dedicated chunked reader.
# A chunked reader can only be walked once, so looping over an already-consumed
# `data` object would report 0 rows.
count_reader = pd.read_csv(datalink, names = ['OSM_id', 'LONGITUDE', 'LATITUDE', 'ALTITUDE'], chunksize = 20000, iterator = True)

sample_count = 0

for chunk in count_reader:
  sample_count += len(chunk)

sample_count

434874

**Q9-Q11**

In [99]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score

In [98]:
# Load the iris dataset; this rebinds `data`, which earlier cells used for other inputs.
data = load_iris()

In [100]:
# Feature matrix and class labels from the loaded dataset.
X, Y = data.data, data.target

In [101]:
# Hold out 20% of the samples for testing; the seed fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 10,
)

In [102]:
# Normalizer with default settings; it rescales each sample (row) independently.
normalizer = Normalizer()

In [103]:
# Fit on the training split, then apply the same transformer to both splits.
X_train_scaled = normalizer.fit(X_train).transform(X_train)
X_test_scaled = normalizer.transform(X_test)

In [113]:
# k-nearest-neighbours classifier that votes among the 3 closest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

In [114]:
# Train on the normalised training split and classify the normalised test split.
Y_pred = knn.fit(X_train_scaled, Y_train).predict(X_test_scaled)

In [115]:
# Share of test samples whose predicted class equals the true class.
print(accuracy_score(Y_test, Y_pred))

0.9666666666666667


In [116]:
from sklearn.metrics import f1_score

In [117]:
# F1 score computed with average="weighted" over the classes.
print(f1_score(Y_test, Y_pred, average = "weighted"))

0.9671111111111111


# Part B

**Q1**

In [121]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [123]:
def compute_GridSearchCV(kernels, regularization, random_state = None):
  """Grid-search an SVC over kernels and C values on an iris training split.

  Parameters
  ----------
  kernels : sequence
      Candidate values for the SVC ``kernel`` parameter.
  regularization : sequence
      Candidate values for the SVC ``C`` parameter.
  random_state : int or None, optional
      Seed forwarded to ``train_test_split``. The default ``None`` keeps the
      original unseeded split, so the score can differ between calls; pass an
      integer to make the result reproducible.

  Returns
  -------
  float
      ``best_score_`` of the fitted 4-fold ``GridSearchCV``.
  """
  data = load_iris()
  X = data.data
  Y = data.target

  # Only the 70% training part is searched; the held-out part is not used here.
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size = 0.3, random_state = random_state)

  model = SVC(gamma = 'auto', random_state = 0)

  param_grid = {
      'kernel': kernels,
      'C' : regularization
  }

  grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv = 4)
  grid_search.fit(X_train, Y_train)

  return grid_search.best_score_

In [125]:
# Candidate kernels and C values handed to compute_GridSearchCV.
kernels = ['linear', 'rbf']
regularization = [1, 15, 25]

In [126]:
# The split inside compute_GridSearchCV is unseeded in this call, so the printed score can vary between runs.
print(compute_GridSearchCV(kernels, regularization))

0.9807692307692308


**Q2-Q3**

In [127]:
# Read the Social_Network_Ads CSV into a DataFrame (rebinds `data`).
data = pd.read_csv("/content/Social_Network_Ads.csv")

In [135]:
# Features are every column except the last; the last column is the label.
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out a quarter of the rows for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.25,
    random_state = 0,
)

In [136]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [137]:
# Support vector classifier with a linear kernel.
svc = SVC(kernel = 'linear', random_state = 0)

In [138]:
# Train on the standardised training split and classify the standardised test split.
Y_pred = svc.fit(X_train_scaled, Y_train).predict(X_test_scaled)

In [139]:
# Share of test samples whose predicted label equals the true label.
accuracy_score(Y_test, Y_pred)

0.9

In [140]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (first argument) against predicted labels (second argument).
print(confusion_matrix(Y_test, Y_pred))

[[66  2]
 [ 8 24]]


**Q4-Q5**

In [141]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline

In [142]:
# Fetch MNIST from OpenML; as_frame = False requests arrays rather than DataFrames.
mnist = fetch_openml('mnist_784', version = 1, as_frame = False)
X = mnist.data
Y = mnist.target

In [143]:
# First 20000 samples for training, the next 5000 for testing.
train_rows = slice(None, 20000)
test_rows = slice(20000, 25000)

X_train, Y_train = X[train_rows], Y[train_rows]
X_test, Y_test = X[test_rows], Y[test_rows]

In [145]:
# Two-stage pipeline: min-max scaling followed by a linear-kernel SVC.
svc_steps = [
    ('scaler', MinMaxScaler()),
    ('svc', SVC(kernel = 'linear', decision_function_shape = 'ovr', class_weight = None)),
]
pipe = Pipeline(svc_steps)

In [146]:
# Fit the whole pipeline on the training slice and classify the test slice.
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

In [152]:
# The trace sums the diagonal of the confusion matrix, i.e. the number of correctly classified samples.
print(confusion_matrix(Y_test, Y_pred).trace())

4623


In [148]:
from sklearn.metrics import classification_report

In [151]:
# Text report of per-class precision, recall, F1 and support.
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       478
           1       0.95      0.98      0.97       568
           2       0.92      0.92      0.92       521
           3       0.91      0.90      0.90       516
           4       0.91      0.94      0.92       500
           5       0.88      0.88      0.88       460
           6       0.97      0.96      0.96       491
           7       0.91      0.96      0.94       504
           8       0.93      0.85      0.89       466
           9       0.92      0.88      0.90       496

    accuracy                           0.92      5000
   macro avg       0.92      0.92      0.92      5000
weighted avg       0.92      0.92      0.92      5000



**Q6**

In [157]:
# Fetch MNIST from OpenML; as_frame = False requests arrays rather than DataFrames.
mnist = fetch_openml('mnist_784', version = 1, as_frame = False)
X = mnist.data
y = mnist.target

# Hold out half of the samples for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.5,
    random_state = 42,
)

# Standardise the inputs, then classify with a degree-3 polynomial-kernel SVM.
poly_svm = SVC(
    kernel = 'poly',
    degree = 3,
    decision_function_shape = 'ovr',
    class_weight = 'balanced',
    C = 10,
)
pipeline = Pipeline([('scaler', StandardScaler()), ('svm', poly_svm)])

# Train on one half and predict the other half.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Read the weighted-average F1 from the dictionary form of the report.
report = classification_report(y_test, y_pred, output_dict = True)
weighted_f1_score = report['weighted avg']['f1-score']

print("Weighted Average F1-Score:", weighted_f1_score)

Weighted Average F1-Score: 0.9723258740744227


**Q7**

In [155]:
# Iris features and class labels.
data = load_iris()
X, Y = data.data, data.target

# 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.3,
    random_state = 42,
)

# Polynomial-kernel SVM with C = 10.
svm = SVC(kernel = 'poly', C = 10, gamma = 'auto')
Y_pred = svm.fit(X_train, Y_train).predict(X_test)

# Share of test samples classified correctly.
accuracy_score(Y_test, Y_pred)

1.0

**Q8**

In [156]:
# Iris features and class labels.
data = load_iris()
X, Y = data.data, data.target

# 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.3,
    random_state = 42,
)

# Sigmoid-kernel SVM with C = 25.
svm = SVC(kernel = 'sigmoid', C = 25, gamma = 'auto')
Y_pred = svm.fit(X_train, Y_train).predict(X_test)

# Share of test samples classified correctly.
accuracy_score(Y_test, Y_pred)

0.28888888888888886

**Q9**

In [None]:
# Load the iris dataset for Q9 (rebinds `data`).
data = load_iris()

In [None]:
# FIXME: this cell was left unfinished. The original statement
#   data = data.drop(data[])
# is not valid Python (empty subscript), and the object returned by
# load_iris() is a Bunch, which has no `drop` method. Until the intended
# filter is specified, `data` is left unchanged so the notebook can run.

In [None]:
# Feature matrix and class labels from the loaded dataset.
X, Y = data.data, data.target

# 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.3,
    random_state = 42,
)

# Polynomial-kernel SVM with C = 10.
svm = SVC(kernel = 'poly', C = 10, gamma = 'auto')
Y_pred = svm.fit(X_train, Y_train).predict(X_test)

# Share of test samples classified correctly.
accuracy_score(Y_test, Y_pred)