In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and deprecated the old names, so use whichever spelling this installation
# provides and fall back to the default style if neither is available.
_style_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_plot_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    'default',
)
plt.style.use(_plot_style)
plt.rcParams['font.size'] = 10

In [None]:
# Load the comma-separated project data set from the working directory
DATA_FILE = "GroupProjectDataSet.csv"
df = pd.read_csv(DATA_FILE)
print('Shape of data frame:', df.shape)
df.head(10)

In [None]:
# Filling missing values for variables where appropriate

# Columns whose missing entries are replaced by the literal string "None"
none_fill_columns = [
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageCond", "GarageType", "GarageFinish", "GarageQual",
    "BsmtFinType2", "BsmtExposure", "BsmtQual", "BsmtCond", "BsmtFinType1",
]
for column in none_fill_columns:
    df[column] = df[column].fillna(value="None")

In [None]:
# Number of missing entries per column, largest first; columns without any
# missing entry are left out of the bar chart
missing = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing.plot.bar()

In [None]:
# Share of missing values per variable (a fraction between 0 and 1),
# shown next to the absolute counts held in `missing`

null_mask = df.isnull()
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [missing, percent],
    axis=1,
    keys=['Nr. of missing values', 'Share'],
)
missing_data.head(5)

In [None]:
##### Dealing with Categorical Features (Encoding Categorical Variables) / Splitting Into X and y

# Numerical variables that should be handled as categorical variables:
# MSSubClass codes become "SC<code>" labels and MoSold month numbers become
# three-letter month names. Values not listed in a mapping are left untouched.
subclass_codes = (20, 30, 40, 45, 50, 60, 70, 75,
                  80, 85, 90, 120, 150, 160, 180, 190)
month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

df = df.replace({
    "MSSubClass": {code: "SC{}".format(code) for code in subclass_codes},
    "MoSold": {number: name for number, name in enumerate(month_names, start=1)},
})

In [None]:
# The response is read from the last column of the frame
y = df.iloc[:, -1]

# Every other column is a predictor. pandas expands the categorical
# (object / category dtype) predictors into one indicator column per
# category and passes the remaining columns through unchanged.
predictors = df.iloc[:, :-1]
X = pd.get_dummies(predictors)
X.head()

In [None]:
## Feature Selection##
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Compute the correlation matrix over the numeric and boolean columns only.
# The frame also holds string-valued columns (the categorical codes filled
# with "None", the recoded MSSubClass / MoSold, ...); pandas >= 2.0 raises on
# such columns instead of silently skipping them, so select explicitly.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Plotting Heatmap
plt.figure(figsize = (10,6))
sns.heatmap(corr, annot = True)

In [None]:
# Compute the correlation matrix over the numeric and boolean columns only;
# pandas >= 2.0 raises if string-valued columns are passed to corr().
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Sort the correlations with respect to 'Class'
corr_with_class = corr_matrix['Class'].sort_values(ascending=False)

# Print the correlations
print(corr_with_class)

# Select the 10 features with the highest correlation. 'Class' itself is
# dropped first: its correlation with itself is trivially 1, so it would
# otherwise occupy one of the ten slots without being a usable predictor.
top_features = corr_with_class.drop('Class').nlargest(10).index

# Print the top features
print(top_features)



In [None]:
####Partitioning of the Data Set Into Train and Test Set


from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as test set; stratifying on y keeps the class
# proportions equal in both partitions, and the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Predictors used by the KNN model (single definition for train and test)
knn_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']

# Keep predictors and response of each partition together in one frame
train_df = pd.DataFrame(data=X_train, columns=X.columns)
train_df['Class'] = y_train

test_df = pd.DataFrame(data=X_test, columns=X.columns)
test_df['Class'] = y_test

X_train = train_df[knn_features]
y_train = train_df['Class']
X_test = test_df[knn_features]
y_test = test_df['Class']

# KNN classifies by distance between rows, so a predictor with a large
# numeric range dominates the neighbour search. The selected columns are
# presumably on very different scales (area columns vs. rating/count
# columns), so they are standardised first. Putting the scaler in the
# pipeline means it is fitted on the training data only and then re-applied
# unchanged to the test data.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=20))

# Fit the KNN model on the training data
knn.fit(X_train, y_train)

# Use the trained KNN model to make predictions on the test data
y_pred = knn.predict(X_test)

# Compute the accuracy of the KNN model on the test data
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Define a range of values for k
k_range = range(1, 81)

# Create an empty list to store the cross-validation scores
cv_scores = []

# Perform 10-fold cross-validation for each value of k. The scaler is part
# of the pipeline, so within every fold it is fitted on that fold's training
# rows only; the distance computation then sees standardised predictors.
for k in k_range:
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(np.mean(scores))

# Plot the cross-validation scores as a function of k
plt.plot(k_range, cv_scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Cross-Validation Accuracy')
plt.show()

# Select the best value of k. argmax returns a position in cv_scores, so it
# is mapped back through k_range instead of assuming the grid starts at 1.
best_k = k_range[int(np.argmax(cv_scores))]
print("The best value of k is:", best_k)


In [None]:
#####LDA#####

In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Select the feature matrix X and the target variable Y
lda_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearRemodAdd',
]
X = df[lda_features]
Y = df['Class']

# Train/test split: 20% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Create the LDA model and fit it on the training partition
lda = LDA(solver='lsqr').fit(X_train, y_train)

# Predict on the test data
y_pred = lda.predict(X_test)

# Performance measures. The first figure is sum(y_test) / len(y_test), which
# is the share of the positive class if Class is coded 0/1 -- TODO confirm.
test_accuracy = lda.score(X_test, y_test)
print('default-rate: {0: .4f}'.format(np.sum(y_test)/len(y_test)))
print('score:        {0: .4f}'.format(test_accuracy))
print('error-rate:   {0: .4f}'.format(1-test_accuracy))

# Print the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt

# One scatter plot per feature, arranged in a 3x3 grid
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))
axs = axs.flatten()

# Feature columns to plot (the 'Class' column is not among them)
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearRemodAdd',
]
X_train_no_class = X_train[features]

# Plot each training feature against the training labels on its own axes
for ax, feature in zip(axs, features):
    ax.scatter(X_train_no_class[feature], y_train, alpha=0.5)
    ax.set_xlabel(feature)
    ax.set_ylabel('Class')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Define features and target variable
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearRemodAdd',
]
target = 'Class'


def _show_histogram(column):
    """Draw and display a histogram of one column of ``df``, titled with the column name."""
    plt.hist(df[column], bins='auto')
    plt.title(column)
    plt.show()


# Create histograms for each feature
for feature in features:
    _show_histogram(feature)

# Create histogram for target variable
_show_histogram(target)


In [None]:
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import roc_curve, auc

# Define features and target variable
X = df[['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearRemodAdd']]
y = df['Class']

# Create LDA object and run classifier
lda = LDA(solver='lsqr')
lda.fit(X, y)

# Predict probabilities for each class.
# NOTE: the model is scored on the same rows it was fitted on, so the curves
# and AUC values below are in-sample figures, not held-out estimates.
y_prob = lda.predict_proba(X)

# Compute a one-vs-rest ROC curve and ROC area for each class. Column i of
# predict_proba belongs to lda.classes_[i], so the positive label is taken
# from the fitted model instead of assuming the labels are exactly 0..K-1.
fpr = {}
tpr = {}
roc_auc = {}
for i, label in enumerate(lda.classes_):
    fpr[label], tpr[label], _ = roc_curve(y, y_prob[:, i], pos_label=label)
    roc_auc[label] = auc(fpr[label], tpr[label])

# Plot ROC curve for each class; colours repeat if there are more classes
# than colours, so no class is silently left out of the figure.
plt.figure()
colors = ['red', 'green', 'blue', 'orange', 'purple']
for i, label in enumerate(lda.classes_):
    plt.plot(fpr[label], tpr[label], color=colors[i % len(colors)], lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(label, roc_auc[label]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
#### QDA

import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Select the feature matrix X and the target variable Y
qda_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'FullBath', 'YearRemodAdd',
]
X = df[qda_features]
Y = df['Class']

# Train/test split: 20% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Create the QDA model and fit it on the training partition
qda = QDA().fit(X_train, y_train)

# Predict on the test data
y_pred = qda.predict(X_test)

# Performance measures. The first figure is sum(y_test) / len(y_test), which
# is the share of the positive class if Class is coded 0/1 -- TODO confirm.
test_accuracy = qda.score(X_test, y_test)
print('default-rate: {0: .4f}'.format(np.sum(y_test)/len(y_test)))
print('score:        {0: .4f}'.format(test_accuracy))
print('error-rate:   {0: .4f}'.format(1-test_accuracy))

# Print the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred))

