In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.sparse.linalg import svds

# Load the dataset
file_path = "C:\Codding\Data Science\PROJECT\Data_ML_project.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Step 1: Select relevant columns and preprocess
data_subset = data[["University", "Course Name", "Course rating"]]

# Convert "Course rating" to numeric and handle missing values
data_subset["Course rating"] = pd.to_numeric(data_subset["Course rating"], errors='coerce')
imputer = SimpleImputer(strategy="mean")
data_subset["Course rating"] = imputer.fit_transform(data_subset[["Course rating"]])

# Encode categorical columns (University and Course Name)
label_encoders = {}
for col in ["University", "Course Name"]:
    le = LabelEncoder()
    data_subset[col] = le.fit_transform(data_subset[col])
    label_encoders[col] = le

# Step 2: Handle duplicate entries by averaging ratings
data_aggregated = data_subset.groupby(["University", "Course Name"], as_index=False).mean()

# Pivot data into a matrix with Universities as rows, Courses as columns, and ratings as values
ratings_matrix = data_aggregated.pivot(index="University", columns="Course Name", values="Course rating").fillna(0)

# Step 3: Apply SVD
U, sigma, Vt = svds(ratings_matrix, k=5)  # Keep 5 latent features
sigma = np.diag(sigma)

# Reconstruct the ratings matrix (approximation)
reconstructed_matrix = np.dot(np.dot(U, sigma), Vt)

# Step 4: Recommendations
# Convert reconstructed matrix back to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_matrix, index=ratings_matrix.index, columns=ratings_matrix.columns)

# Example: Recommend top courses for a specific University
university_index = 0  # Replace with the index of the university you're interested in
original_ratings = ratings_matrix.iloc[university_index]
predicted_ratings = reconstructed_df.iloc[university_index]

# Get top recommended courses
recommendations = (predicted_ratings - original_ratings).sort_values(ascending=False)
print("Top recommended courses for University index", university_index)
print(recommendations.head(5))


  file_path = "C:\Codding\Data Science\PROJECT\Data_ML_project.csv"  # Replace with your file path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset["Course rating"] = pd.to_numeric(data_subset["Course rating"], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset["Course rating"] = imputer.fit_transform(data_subset[["Course rating"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

TypeError: type not understood

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Sample Dataset
data = pd.read_csv("C:\Codding\Data Science\PROJECT\Data_ML_project.csv")

# Encode categorical variables
le_uni = LabelEncoder()
data['University'] = le_uni.fit_transform(data['University'])

le_education = LabelEncoder()
data['Parental_Education_Level'] = le_education.fit_transform(data['Parental_Education_Level'])

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['Aptitude Score', 'Family_Income', 'Previous Score', 'Difficulty Level']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Create interaction matrix (Course rating as the matrix values)
interaction_matrix = data[['Course Name', 'Course rating']].set_index('Course Name').T

# Fill missing ratings with 0
interaction_matrix = interaction_matrix.fillna(0)

# Apply SVD
U, sigma, Vt = np.linalg.svd(interaction_matrix, full_matrices=False)

# Reduce dimensions
k = 2
U_k = U[:, :k]
sigma_k = np.diag(sigma[:k])
Vt_k = Vt[:k, :]

# Reconstruct the interaction matrix
predicted_matrix = np.dot(np.dot(U_k, sigma_k), Vt_k)
predicted_df = pd.DataFrame(predicted_matrix, index=interaction_matrix.index, columns=interaction_matrix.columns)

print("Predicted Ratings:")
print(predicted_df)

# Content-based similarity (student features vs. course features)
# Combine features into a single vector
course_features = data[numerical_features + ['University', 'Parental_Education_Level']]
student_profile = np.array([0.9, 0.5, 0.8, 0.7, 1, 2])  # Example normalized profile

# Compute similarity
similarity_scores = cosine_similarity(course_features, student_profile.reshape(1, -1)).flatten()
data['Similarity'] = similarity_scores

# Combine similarity with predicted ratings
data['Predicted Rating'] = predicted_df.iloc[0].values
data['Final Score'] = 0.7 * data['Predicted Rating'] + 0.3 * data['Similarity']

# Recommend top courses
recommendations = data[['Course Name', 'Final Score']].sort_values(by='Final Score', ascending=False)
print("\nTop Recommendations:")
print(recommendations.head(3))


  data = pd.read_csv("C:\Codding\Data Science\PROJECT\Data_ML_project.csv")


ValueError: could not convert string to float: 'Previous_Scores'

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Sample Data: User-Course interaction matrix
data = pd.DataFrame({
    'Course_1': [5, 4, np.nan, 3],
    'Course_2': [np.nan, 3, 4, 2],
    'Course_3': [4, np.nan, 5, np.nan],
    'Course_4': [3, 5, 4, np.nan],
    'Course_5': [np.nan, np.nan, 3, 4]
}, index=['Student_1', 'Student_2', 'Student_3', 'Student_4'])

print("Original Data:")
print(data)

# Step 1: Fill missing values (e.g., with 0 for simplicity)
interaction_matrix = data.fillna(0).values

# Step 2: Perform SVD
U, sigma, Vt = np.linalg.svd(interaction_matrix, full_matrices=False)

# Step 3: Convert sigma (1D array) to a diagonal matrix
sigma = np.diag(sigma)

# Step 4: Reconstruct the matrix using SVD components
reconstructed_matrix = np.dot(np.dot(U, sigma), Vt)

# Step 5: Create a DataFrame for the predicted matrix
predicted_df = pd.DataFrame(reconstructed_matrix, index=data.index, columns=data.columns)

print("\nPredicted Ratings:")
print(predicted_df)

# Step 6: Evaluate (if ground truth is available)
# Example: Calculate RMSE for known ratings
known_values = ~data.isna()
rmse = np.sqrt(mean_squared_error(data[known_values], predicted_df[known_values]))
print(f"\nRMSE on Known Values: {rmse:.4f}")


In [23]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Sample Data: User-Course interaction matrix
data = pd.read_csv('C:\Codding\Data Science\PROJECT\Data_ML_project.csv')


  data = pd.read_csv('C:\Codding\Data Science\PROJECT\Data_ML_project.csv')


In [24]:
data.head()

Unnamed: 0,Course Name,University,Aptitude Score,Family_Income,Parental_Education_Level,Previous Score,Extracurricular activity,Difficulty Level,Course rating
0,Acing the Product Management Interview,Advancing Women in Product,71,High,High School,69,Yes,Advanced,5.0
1,The Business of Product Management I,Advancing Women in Product,75,Low,College,65,Yes,Advanced,4.8
2,The Art & Science of Product Management,Advancing Women in Product,62,Medium,High School,63,No,Beginner,5.0
3,Machine Learning Algorithms: Supervised Learni...,Alberta Machine Intelligence Institute,73,Low,College,66,No,Intermediate,4.7
4,Data for Machine Learning,Alberta Machine Intelligence Institute,76,Low,Postgraduate,65,No,Beginner,4.4


In [25]:
# Step 1: Fill missing values (e.g., with 0 for simplicity)
interaction_matrix = data.fillna(0).values

In [26]:
interaction_matrix

array([['Acing the Product Management Interview',
        'Advancing Women in Product', '71', ..., 'Yes', 'Advanced', '5'],
       ['The Business of Product Management I',
        'Advancing Women in Product', '75', ..., 'Yes', 'Advanced',
        '4.8'],
       ['The Art & Science of Product Management',
        'Advancing Women in Product', '62', ..., 'No', 'Beginner', '5'],
       ...,
       ['Gender, Family, and Social Change in Contemporary South Korea',
        'Yonsei University', '94', ..., 'Yes', 'Intermediate', '4.7'],
       ['Deep Learning for Business', 'Yonsei University', '79', ...,
        'Yes', 'Beginner', '4.3'],
       ['Re-imaging God in Korean Context', 'Yonsei University', '87',
        ..., 'Yes', 'Intermediate', '4.7']], dtype=object)

In [28]:
# Step 2: Perform SVD
U, sigma, Vt = np.linalg.svd(data, full_matrices=False)

UFuncTypeError: Cannot cast ufunc 'svd_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'

In [8]:
U

array([[ 0.52084369,  0.46701117,  0.35831106, -0.61825205],
       [ 0.52830532,  0.45028761, -0.13479683,  0.70708159],
       [ 0.60528967, -0.7568126 ,  0.23516597,  0.07453908],
       [ 0.288513  , -0.07985191, -0.8933868 , -0.33502829]])

In [9]:
sigma

array([11.4454654 ,  5.41520491,  4.52053551,  3.77380926])

In [11]:
Vt

array([[ 0.48778958,  0.40042938,  0.44644957,  0.57885076,  0.25948451],
       [ 0.71957583, -0.33906222, -0.35382194,  0.1154566 , -0.47825437],
       [-0.31584585, -0.27662657,  0.5771604 ,  0.29678185, -0.63444901],
       [-0.336005  ,  0.46354873, -0.55654981,  0.52435297, -0.29585383]])

In [12]:
# Step 3: Convert sigma (1D array) to a diagonal matrix
sigma = np.diag(sigma)

In [13]:
sigma

array([[11.4454654 ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  5.41520491,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  4.52053551,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  3.77380926]])

In [14]:
# Step 4: Reconstruct the matrix using SVD components
reconstructed_matrix = np.dot(np.dot(U, sigma), Vt)

In [15]:
reconstructed_matrix

array([[ 5.00000000e+00,  4.64398014e-15,  4.00000000e+00,
         3.00000000e+00,  1.26998339e-14],
       [ 4.00000000e+00,  3.00000000e+00, -1.03414793e-14,
         5.00000000e+00,  1.40120194e-14],
       [-8.28030058e-15,  4.00000000e+00,  5.00000000e+00,
         4.00000000e+00,  3.00000000e+00],
       [ 3.00000000e+00,  2.00000000e+00,  1.90475611e-15,
         1.15874650e-15,  4.00000000e+00]])

In [16]:
# Step 5: Create a DataFrame for the predicted matrix
predicted_df = pd.DataFrame(reconstructed_matrix, index=data.index, columns=data.columns)

In [21]:
data

Unnamed: 0,Course_1,Course_2,Course_3,Course_4,Course_5
Student_1,5.0,,4.0,3.0,
Student_2,4.0,3.0,,5.0,
Student_3,,4.0,5.0,4.0,3.0
Student_4,3.0,2.0,,,4.0


In [20]:
print("\nPredicted Ratings:")
predicted_df


Predicted Ratings:


Unnamed: 0,Course_1,Course_2,Course_3,Course_4,Course_5
Student_1,5.0,4.64398e-15,4.0,3.0,1.269983e-14
Student_2,4.0,3.0,-1.034148e-14,5.0,1.401202e-14
Student_3,-8.280301e-15,4.0,5.0,4.0,3.0
Student_4,3.0,2.0,1.904756e-15,1.158747e-15,4.0


In [19]:
# Step 6: Evaluate (if ground truth is available)
# Example: Calculate RMSE for known ratings
known_values = ~data.isna()
rmse = np.sqrt(mean_squared_error(data[known_values], predicted_df[known_values]))
print(f"\nRMSE on Known Values: {rmse:.4f}")

ValueError: Input contains NaN.