## Exercise 1: Exploratory Data Analysis

### **Objective:** 

Every data science project starts with the following steps:

1. Data Loading 
2. Data Cleaning
3. Data Preparation
4. Exploratory Data Analysis

In this exercise we load a dataset from a URL, clean the data by assigning feature names and checking for missing values, prepare the data by creating new features, and conduct exploratory data analysis by answering questions about the data.

### **Dataset:**

Auto MPG dataset from the UCI Machine Learning Repository. The dataset contains 398 samples with 8 features, and the
target variable is the fuel efficiency in miles per gallon. (https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data)

1. **Data Loading**
    
    1.1 Import the necessary libraries: pandas and numpy.
    
    1.2 Load the auto-mpg dataset from the given url.

2. **Data Cleaning**

    2.1 Assign the following column names to the dataset: ["mpg", "cylinders", "displacement", "horsepower", "weight","acceleration", "model_year", "origin", "car_name"]

    2.2 Get basic information about the data and check for missing values.

3. **Data Preparation**

    3.1 Check the data types and adjust if needed.
    
    3.2 Create a new feature called "bestsellers" that flags all cars with 8 cylinders and model year 70 with 1, all cars with 6 cylinders and model year 80 with 2, and the rest with 0.

4. **Exploratory Data Analysis**

    4.1 What is the average *horsepower* and minimum *weight* of the cars in this dataset?

    4.2 What is the maximum *mpg* per *model year* and *cylinder*? 

    4.3 How many cars weigh less than 3449 lbs and have more than 5 cylinders?

#### **Hints/reminders:**

- pd.read_csv(url)
- data.columns = ["a", "b", … ] renames columns to 'a', 'b', etc.
- pd.to_numeric(data["a"], errors='coerce') converts column a to numeric data, or to NaN where conversion is not possible
- data = data.dropna() drops all rows that contain NaN
- remember np.mean() and np.abs()
- see np.where()
- check the pandas function df.groupby & df.apply


In [1]:
import pandas as pd

In [None]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd


# Read the iris measurements from a local CSV file.
data = pd.read_csv("c:/data/iris.csv")

# 'species' is the label; all remaining columns serve as features.
Y = data['species']
X = data.drop(columns='species')

# Entropy-based decision tree, trained on the full dataset (no hold-out split).
model = DecisionTreeClassifier(criterion='entropy')
model.fit(X, Y)
res = model.predict(X)  # predictions on the training data itself

# Draw the fitted tree on a wide canvas so the node labels stay readable.
fig = plt.figure(figsize=(30, 10))
plot_options = {
    "feature_names": X.columns.tolist(),
    "class_names": model.classes_.tolist(),
    "filled": True,
    "fontsize": 10,
}
tree.plot_tree(model, **plot_options)

# Print the same tree as plain-text rules.
text = tree.export_text(model)
print(text)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from xgboost import plot_tree
from sklearn import datasets

# Load the bundled iris dataset and wrap the measurements in a DataFrame.
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# Regression task: predict the fourth feature column from the first three.
X = data.iloc[:, :3]
y = data.iloc[:, [3]]  # list indexer keeps y a one-column DataFrame

# 70/30 train/test split with a fixed seed for reproducibility.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Fit a gradient-boosted regressor with default settings and predict the
# held-out rows.
model = XGBRegressor()
model.fit(Xtrain, Ytrain)
pred = model.predict(Xtest)

# Actual vs. predicted values on the test set.
plt.scatter(Ytest, pred)
plt.xlabel("actual")
plt.ylabel("predicted")

# Mean absolute error on the test set.
mae = metrics.mean_absolute_error(Ytest, pred)
print(f"MAE: {mae!s}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


# Load the churn data from a local CSV file
data = pd.read_csv('C:\\data\\churn_exercise.csv')

# Split into training and testing sets (80/20, fixed seed for reproducibility).
# Features: usage_hours and complaints; target: customer_left.
X_train, X_test, y_train, y_test = train_test_split(
    data[['usage_hours', 'complaints']],
    data.customer_left,
    test_size=0.2,
    random_state=42,
)

# Create a Decision Tree Classifier; max_depth=3 limits the tree size so the
# plot below stays readable
dt_model = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Train the model on the training data
dt_model.fit(X_train, y_train)

# Visualize the decision tree
# NOTE(review): class_names assumes the sorted class labels map to
# ['No Churn', 'Churn'] in that order (e.g. 0/1) - confirm against the data.
plt.figure(figsize=(20, 10))
plot_tree(
    dt_model,
    filled=True,
    feature_names=X_train.columns.tolist(),
    class_names=['No Churn', 'Churn'],
)
plt.show()

# Make predictions on the testing data
y_pred = dt_model.predict(X_test)

# Compute the accuracy of the model (fraction of correct test predictions)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Compute the confusion matrix (rows: actual, columns: predicted)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Labelled version of the same table. Stored under its own name so that the
# imported sklearn function `confusion_matrix` is not overwritten, which would
# break re-running this cell and any later call to confusion_matrix().
conf_crosstab = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(conf_crosstab)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error  # used for the MAE evaluation

# Load the data; the file is read with header=None, so the column names are
# assigned explicitly afterwards.
data = pd.read_csv("houses.csv", header=None)
data.columns = ['living_space', 'size_of_property', 'price']

# Single predictor (living space) and target (price).
x = data['living_space']
y = data['price']

# Polynomial regression of degree 5: least-squares coefficients wrapped in a
# callable polynomial.
degree = 5
coefficients = np.polyfit(x, y, degree)
model = np.poly1d(coefficients)

# In-sample predictions
y_hat = model(x)

# Evenly spaced living-space values (0 .. max observed) for drawing the curve
print_data = np.linspace(0, np.max(x), 100)
curve = model(print_data)

# Visualization: raw observations plus the fitted polynomial
plt.figure()
plt.scatter(x, y)
plt.plot(print_data, curve, c="black")
plt.xlabel("Living space (m²)")
plt.ylabel("Price (€)")
plt.title(f"Polynomial Regression (Degree {degree})")
plt.grid(True)
plt.show()

# Error estimate: mean absolute error on the data the model was fitted to
mae = mean_absolute_error(y, y_hat)
print(f'The MAE is {mae:.2f}')

# Predicted price for a living space of 280 m²
house280 = model(280)
print(f'The prediction for a house of size 280 is {house280:.2f}')


In [None]:
# Imports are repeated here so this cell also runs on a fresh kernel instead
# of depending on names left behind by other cells.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

data = pd.read_csv('Fish.csv')

# Column 1 is the regression target; columns 2 onward are the features.
# The list indexer keeps Y a one-column DataFrame.
X = data.iloc[:, 2:]
Y = data.iloc[:, [1]]

# Hold out 30% of the rows for testing, consistent with the other splits in
# this notebook (the previous test_size=0.7 trained on only 30% of the data).
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=42, test_size=0.3, shuffle=True
)

# Fit a gradient-boosted regressor with default settings and predict the
# held-out rows.
model = XGBRegressor()
model.fit(x_train, y_train)
pred = model.predict(x_test)

# Actual vs. predicted values on the test set.
plt.scatter(y_test, pred)
plt.xlabel('actual')
plt.ylabel('predicted')
plt.show()

mae = mean_absolute_error(y_test, pred)
print(f"MAE is: {mae}")

# Naive baseline: always predict the mean of the training target.
baseline = np.repeat(y_train.mean().values, len(y_test))

print('MAE comparing by baseline is: ' , mean_absolute_error(y_test, baseline))

In [None]:
import pandas as pd
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split


# Iris measurements as a DataFrame; every feature column is used as input.
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
X = data

# Binary target: True where the class index equals 1, False otherwise.
y = iris.target == 1

# 70/30 train/test split with a fixed seed for reproducibility.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
    shuffle=True,
)

# Gradient-boosted classifier with default settings.
model = XGBClassifier()
model.fit(Xtrain, Ytrain)
pred = model.predict(Xtest)

from sklearn.metrics import confusion_matrix

# Compute the confusion matrix once, print it, then unpack its four cells
# (the flattened binary matrix is ordered tn, fp, fn, tp).
cm = confusion_matrix(Ytest, pred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# Class probabilities; column 1 is the score for the positive class and
# feeds the ROC curve.
pred_proba = model.predict_proba(Xtest)
fpr, tpr, thresholds = metrics.roc_curve(Ytest, pred_proba[:, 1])

import matplotlib.pyplot as plt

# ROC curve: true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.show()

In [None]:
# Imports are repeated here so this cell also runs on a fresh kernel instead
# of depending on names left behind by other cells.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

df = pd.read_csv('responses.csv', sep=',')

# Keep only the non-object (numeric) columns and drop rows with missing values.
df_num = df.select_dtypes(exclude='object')
df_num = df_num.dropna()

# Features: every numeric column except Age.
# Target: whether the respondent's Age is at least 19.
X = df_num.drop(columns=['Age'])
y = df_num['Age'] >= 19

# 70/30 train/test split with a fixed seed for reproducibility.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)

# Gradient-boosted classifier with default settings.
model = XGBClassifier()
model.fit(Xtrain, Ytrain)

pred = model.predict(Xtest)

# Compute the confusion matrix once and reuse it.
cm = confusion_matrix(Ytest, pred)
print(cm)

# For a binary problem the flattened matrix is ordered tn, fp, fn, tp.
# The previous unpacking order (tn, fp, tp, fn) swapped tp and fn.
tn, fp, fn, tp = cm.ravel()

# Class probabilities; column 1 is the score for the positive class and
# feeds the ROC curve.
pred_proba = model.predict_proba(Xtest)

fpr, tpr, thresholds = metrics.roc_curve(Ytest, pred_proba[:, 1])

# ROC curve: true-positive rate against false-positive rate.
plt.plot(fpr, tpr)
plt.show()
