# STAT6340
### NAME: Shradha Upadhyay

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Smarket data from a CSV file in the working directory
df = pd.read_csv('Smarket.csv')

# Preview the first rows of the data
print(df.head())

# Structure of the dataframe (columns, dtypes, non-null counts).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# Number of observations recorded for each value of 'Year'
year_counts = df['Year'].value_counts()
print(year_counts)

# Summary statistics of the dataframe
summary = df.describe()
print(summary)

# How many 'Today' values are negative versus non-negative
today = df['Today']
print("Count of 'Today' < 0:", today.lt(0).sum())
print("Count of 'Today' >= 0:", today.ge(0).sum())

# Pairwise scatter plots of the columns (can be slow with large datasets)
pd.plotting.scatter_matrix(df, figsize=(10, 10))
plt.show()

In [None]:
# Correlation matrix of every column except 'Direction'.
# The column is dropped by name rather than by position, so the result does
# not depend on 'Direction' being the last column of the CSV.
correlation_matrix = df.drop(columns='Direction').corr()
print(correlation_matrix.round(2))

In [None]:
# Rows with Year before 2005 form the training set; all other rows are the test set
train_mask = df['Year'] < 2005
predictors = ['Lag1', 'Lag2']

train_rows = df[train_mask]
test_rows = df[~train_mask]

# Predictor matrices and 'Direction' label vectors as NumPy arrays
train_X = train_rows[predictors].values
train_y = train_rows['Direction'].values
test_X = test_rows[predictors].values
test_y = test_rows['Direction'].values

# Scatter plot of the training data: green where Direction is 'Up', red otherwise
point_colors = np.where(train_y == 'Up', 'green', 'red')
plt.scatter(train_X[:, 0], train_X[:, 1], c=point_colors)
plt.xlabel('Lag1')
plt.ylabel('Lag2')
plt.show()

In [None]:
# Fit a 1-nearest-neighbour classifier on the training data
knn = KNeighborsClassifier(n_neighbors=1).fit(train_X, train_y)

# Error rate on the data the model was fitted to
train_pred = knn.predict(train_X)
train_accuracy = accuracy_score(train_y, train_pred)
print("Training Error Rate:", 1 - train_accuracy)

# Error rate on the held-out test data
test_pred = knn.predict(test_X)
test_accuracy = accuracy_score(test_y, test_pred)
print("Test Error Rate:", 1 - test_accuracy)

# Confusion matrix for the test data: predicted labels as rows, actual labels as columns
confusion = pd.crosstab(test_pred, test_y, rownames=['Predicted'], colnames=['Actual'])
print(confusion)

In [None]:
# Candidate neighbourhood sizes: every k from 1 to 30, then 35 to 100 in steps of 5
ks = [*range(1, 31), *range(35, 101, 5)]
train_error_rates, test_error_rates = [], []

# Refit the classifier for each k and record both error rates
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)

    train_pred = knn.predict(train_X)
    test_pred = knn.predict(test_X)

    train_accuracy = accuracy_score(train_y, train_pred)
    test_accuracy = accuracy_score(test_y, test_pred)
    train_error_rates.append(1 - train_accuracy)
    test_error_rates.append(1 - test_accuracy)

# Plot training and test error rate against k
for rates, label, color in (
    (train_error_rates, 'Training Error Rate', 'blue'),
    (test_error_rates, 'Test Error Rate', 'purple'),
):
    plt.plot(ks, rates, label=label, color=color, marker='o')
plt.xlabel('Number of Nearest Neighbors')
plt.ylabel('Error Rate')
plt.legend()
plt.show()

# The k with the lowest test error rate (first one wins on ties)
best_index = np.argmin(test_error_rates)
optimal_k = ks[best_index]
print(f'Optimal k: {optimal_k}')

In [None]:
# Regular 50 x 50 grid spanning the observed range of the two training predictors
x1_grid = np.linspace(train_X[:, 0].min(), train_X[:, 0].max(), 50)
x2_grid = np.linspace(train_X[:, 1].min(), train_X[:, 1].max(), 50)
xx, yy = np.meshgrid(x1_grid, x2_grid)
grid = np.c_[xx.ravel(), yy.ravel()]

# Fit KNN with the optimal k and evaluate it at every grid point
knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(train_X, train_y)
# Hard class labels on the grid; not used by the plot below, kept for later inspection
grid_pred = knn.predict(grid)

# The columns of predict_proba follow the order of knn.classes_, so locate the
# 'Up' column explicitly instead of assuming it sits at index 1. This raises a
# ValueError if 'Up' is not among the fitted classes.
up_column = list(knn.classes_).index('Up')
probabilities = knn.predict_proba(grid)[:, up_column].reshape(xx.shape)

# Plot the decision boundary: the two filled regions are split at P('Up') = 0.5,
# with the training points drawn on top (green where Direction is 'Up', red otherwise)
plt.contourf(xx, yy, probabilities, levels=[0, 0.5, 1], alpha=0.3)
plt.scatter(train_X[:, 0], train_X[:, 1], c=np.where(train_y == 'Up', 'green', 'red'))
plt.xlabel('Lag1')
plt.ylabel('Lag2')
plt.show()

In [None]:
# Baseline classifier: predict 'Up' for every test observation
n_test = len(test_y)
naive_pred = np.array(n_test * ['Up'])

# Test error rate of the baseline
naive_accuracy = accuracy_score(test_y, naive_pred)
naive_error_rate = 1 - naive_accuracy
print(f"Naive Prediction Test Error Rate: {naive_error_rate}")