In [14]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_data(file_path, sheet_name):
    """Read one worksheet of an Excel workbook with the ``openpyxl`` engine.

    Args:
        file_path: Location of the workbook, forwarded to ``pandas.read_excel``.
        sheet_name: Sheet selector, forwarded to ``pandas.read_excel``.

    Returns:
        The object ``pandas.read_excel`` produces for that selection.
    """
    read_options = {'engine': 'openpyxl', 'sheet_name': sheet_name}
    return pd.read_excel(file_path, **read_options)

def calculate_equations(A, C):
    """Render every row of the linear system ``A x = C`` as a readable string.

    Args:
        A: 2-D array of coefficients, one row per equation. Any number of
            columns is supported; unknowns are named ``x1``, ``x2``, ...
        C: Right-hand sides, indexable by row. May be 1-D or a column vector.

    Returns:
        A list with one string per row of ``A``, shaped like
        ``"20.0x1 + 6.0x2 + 2.0x3 = 386"``.
    """
    equations = []
    for i in range(A.shape[0]):
        rhs = np.asarray(C[i])
        # A column-vector C yields a one-element array per row, which would
        # print as "[386]"; unwrap it so the equation shows the bare number.
        if rhs.size == 1:
            rhs = rhs.item()
        terms = " + ".join(
            f"{coefficient}x{position}"
            for position, coefficient in enumerate(A[i], start=1)
        )
        equations.append(f"{terms} = {rhs}")
    return equations

def calculate_rank(A):
    """Return the rank of ``A`` as reported by ``numpy.linalg.matrix_rank``."""
    rank = np.linalg.matrix_rank(A)
    return rank

def calculate_cost_vector(A, C):
    """Solve ``A x = C`` for ``x`` through the pseudo-inverse of ``A``.

    Args:
        A: Coefficient matrix.
        C: Right-hand side; must be compatible with ``pinv(A)`` under
            ``numpy.dot``.

    Returns:
        ``numpy.dot(pinv(A), C)``.
    """
    # x = pinv(A) . C
    return np.dot(np.linalg.pinv(A), C)

def mark_category(df, threshold=200, payment_column='Payment (Rs)'):
    """Label each row ``'RICH'`` or ``'POOR'`` according to its payment.

    A row is ``'RICH'`` when its payment is strictly greater than
    ``threshold``; every other row (including one whose comparison is false
    because the payment is missing) is ``'POOR'``.

    Args:
        df: DataFrame holding the payment column. It is modified in place:
            a ``'Category'`` column is added or overwritten.
        threshold: Payment above which a customer counts as rich.
            Defaults to 200.
        payment_column: Name of the column holding the payment.
            Defaults to ``'Payment (Rs)'``.

    Returns:
        The same ``df`` object, now carrying the ``'Category'`` column.
    """
    is_rich = df[payment_column] > threshold
    df['Category'] = np.where(is_rich, 'RICH', 'POOR')
    return df

def split_data(df, X_columns, y_column, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test feature and target arrays.

    Args:
        df: Source DataFrame.
        X_columns: Column labels used as features.
        y_column: Column label used as the target.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Forwarded to ``train_test_split``. Defaults to 42.

    Returns:
        The result of ``train_test_split`` on the feature and target values,
        i.e. ``X_train, X_test, y_train, y_test``.
    """
    features = df[X_columns].values
    target = df[y_column].values
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(features, target, **split_options)

def create_knn_classifier(k=3):
    """Build an unfitted ``KNeighborsClassifier`` that uses ``k`` neighbours.

    Args:
        k: Value passed as ``n_neighbors``. Defaults to 3.

    Returns:
        The newly constructed classifier.
    """
    classifier = KNeighborsClassifier(n_neighbors=k)
    return classifier

def evaluate_model(classifier, X_test, y_test):
    """Predict on the test set and print accuracy, confusion matrix and report.

    Args:
        classifier: Object exposing ``predict``; called with ``X_test``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        None. All results are written to standard output.
    """
    predictions = classifier.predict(X_test)

    # Compute every metric before printing anything.
    accuracy = accuracy_score(y_test, predictions)
    sections = (
        ("\nConfusion Matrix:", confusion_matrix(y_test, predictions)),
        ("\nClassification Report:", classification_report(y_test, predictions)),
    )

    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy:.2f}")
    for heading, content in sections:
        print(heading)
        print(content)

def _report_purchase_costs(purchases, item_columns, payment_column):
    """Print the purchase equations, the rank of A and the solved cost vector (A1, A2)."""
    A = purchases[item_columns].values
    C = purchases[payment_column].values.reshape(-1, 1)
    # Missing quantities are replaced by their column mean before solving.
    A = SimpleImputer(strategy='mean').fit_transform(A)

    equations = calculate_equations(A, C)
    rank_A = calculate_rank(A)
    cost_vector = calculate_cost_vector(A, C)

    print("\nEquations:")
    for equation in equations:
        print(equation)
    print("\nDimensionality of vector space of the data:", rank_A)
    print("No. of vectors in A and C are as follows:", np.shape(A)[0], "and", np.shape(C)[0])
    print("The rank of matrix A is:", rank_A)
    cost_df = pd.DataFrame(data=cost_vector, columns=['Cost of Each Product'])
    print("\nCost of Each Product:")
    print(cost_df)

    # A2: the model vector is the same pseudo-inverse solution as the cost
    # vector, so it is reused rather than solved a second time.
    model_df = pd.DataFrame(data=cost_vector, columns=['Model Vector X'])
    print("\nModel Vector X for Predicting the Cost of Each Product:")
    print(model_df)


def _classify_customers(purchases, item_columns):
    """Label customers RICH/POOR, fit a 3-NN classifier and print its evaluation (A3)."""
    purchases = mark_category(purchases)
    X_train, X_test, y_train, y_test = split_data(purchases, item_columns, 'Category')
    knn_classifier = create_knn_classifier(k=3)
    knn_classifier.fit(X_train, y_train)
    evaluate_model(knn_classifier, X_test, y_test)


def _report_stock_statistics(stock):
    """Print mean/variance of the price and the loss/profit probabilities (A4)."""
    prices = stock['Price']
    print("\nMean of Price data:", np.mean(prices))
    print("Variance of Price data:", np.var(prices))

    # The original filtered the same 'Day' column once by 'Wednesday' and once
    # by 'Wed', so one of the two selections was always empty. A single label
    # is used for both now; 'Wed' matches the abbreviated 'Apr' month filter.
    # NOTE(review): confirm the label against the values stored in the sheet.
    wednesday_label = 'Wed'
    wednesday = stock[stock['Day'] == wednesday_label]

    print("\nMean of Wednesday Price data:", np.mean(wednesday['Price']))
    print("Observation: Compare with population mean to note differences.")

    april_prices = stock[stock['Month'] == 'Apr']['Price']
    print("\nMean of April Price data:", np.mean(april_prices))
    print("Observation: Compare with population mean to note differences.")

    loss_probability = np.mean(stock['Chg%'] < 0)
    print("\nProbability of making a loss over the stock:", loss_probability)

    wednesday_profits = wednesday[wednesday['Chg%'] > 0]
    # Guard the division: with no Wednesday rows the ratio is undefined.
    if len(wednesday):
        profit_given_wednesday = len(wednesday_profits) / len(wednesday)
    else:
        profit_given_wednesday = float('nan')
    print("\nProbability of making a profit on Wednesday:", profit_given_wednesday)
    print("\nConditional probability of making profit, given that today is Wednesday:", profit_given_wednesday)


def main():
    """Run the lab analyses on ``Lab_Session_Data.xlsx`` and print the results.

    Reads the 'Purchase data' sheet for the cost-vector and classification
    tasks and the 'IRCTC Stock Price' sheet for the stock statistics.
    Everything is written to standard output; nothing is returned.
    """
    file_path = r"Lab_Session_Data.xlsx"
    item_columns = ['Candies (#)', 'Mangoes (Kg)', 'Milk Packets (#)']

    # A1 / A2: cost vector from the purchase sheet.
    purchases = load_data(file_path, 'Purchase data')
    _report_purchase_costs(purchases, item_columns, 'Payment (Rs)')

    # A3: classify customers.
    _classify_customers(purchases, item_columns)

    # A4: stock price analysis.
    stock = load_data(file_path, 'IRCTC Stock Price')
    _report_stock_statistics(stock)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



Equations:
20.0x1 + 6.0x2 + 2.0x3 = [386]
16.0x1 + 3.0x2 + 6.0x3 = [289]
27.0x1 + 6.0x2 + 2.0x3 = [393]
19.0x1 + 1.0x2 + 2.0x3 = [110]
24.0x1 + 4.0x2 + 2.0x3 = [280]
22.0x1 + 1.0x2 + 5.0x3 = [167]
15.0x1 + 4.0x2 + 2.0x3 = [271]
18.0x1 + 4.0x2 + 2.0x3 = [274]
21.0x1 + 1.0x2 + 4.0x3 = [148]
16.0x1 + 2.0x2 + 4.0x3 = [198]

Dimensionality of vector space of the data: 3
No. of vectors in A and C are as follows: 10 and 10
The rank of matrix A is: 3

Cost of Each Product:
   Cost of Each Product
0                   1.0
1                  55.0
2                  18.0

Model Vector X for Predicting the Cost of Each Product:
   Model Vector X
0             1.0
1            55.0
2            18.0

Model Evaluation:
Accuracy: 1.00

Confusion Matrix:
[[1 0]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

        POOR       1.00      1.00      1.00         1
        RICH       1.00      1.00      1.00         1

    accuracy                           1.00     