# Preliminaries

In this lab, we will briefly walk through the datasets used throughout the class and apply some basic data loading, visualization, and processing.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# paths under /content/drive/MyDrive used by the cells below can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('seaborn-white')

# 1. Data loading

## 1.1. Read data with pandas

### 1) Use the pandas readers (`read_csv`, `read_excel`)



In [None]:
# Read the advertising CSV, keeping only the columns at positions 1-4
# (position 0 is skipped; presumably a saved row index - confirm against the file).
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Advertising.csv',
    usecols=[1, 2, 3, 4],
)
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Read the Excel workbook straight into a DataFrame using pandas' default
# sheet and engine selection.
df = pd.read_excel(
    io='/content/drive/MyDrive/time-series-2022/Data/Default.xlsx',
)
# Print column names, dtypes and non-null counts.
df.info()

### 2) Use openpyxl

In [None]:
# Open the workbook with openpyxl and pick the worksheet named 'Sheet1'.
wb = openpyxl.load_workbook('/content/drive/MyDrive/time-series-2022/Data/Default.xlsx')
ws = wb['Sheet1']

# ws.values yields one tuple of cell values per worksheet row.
data = ws.values
# Consume the first row as the header; the remaining rows stay in the generator.
data_cols = tuple(next(data))

# Build the frame from the rows that are left, labelled with the header row.
df = pd.DataFrame(data, columns=data_cols)
df.info()

### 3) Load the Default2.xlsx file

In [None]:
# Read Default2.xlsx, explicitly asking pandas to parse it with openpyxl.
df = pd.read_excel(
    io='/content/drive/MyDrive/time-series-2022/Data/Default2.xlsx',
    engine='openpyxl',
)
# Print column names, dtypes and non-null counts.
df.info()

## 1.2. Print the loaded data

In [None]:
# Load Default.xlsx from the course data folder. The absolute
# /content/drive/... path works regardless of the current working directory
# and matches the location the other cells of this notebook read from.
df = pd.read_excel('/content/drive/MyDrive/time-series-2022/Data/Default.xlsx')
# Show the first 3 rows.
df.head(3)

In [None]:
# Reload the Default workbook.
df = pd.read_excel(
    io='/content/drive/MyDrive/time-series-2022/Data/Default.xlsx',
)
# Show the last 5 rows.
df.tail(n=5)

In [None]:
# Reload the advertising data (columns at positions 1-4 only).
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Advertising.csv',
    usecols=[1, 2, 3, 4],
)
# Summary statistics for three of the columns.
df.loc[:, ['Radio', 'TV', 'Newspaper']].describe()

## 1.3. Select a few features from the data

In [None]:
# Load the Hitters data with all of its columns.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Hitters.csv',
)
# Show the first 3 rows.
df.head(n=3)

In [None]:
# Labels of the columns to keep.
feature_names = ['Years', 'Hits']

# Label-based selection of just those columns from the frame loaded above.
X = df.loc[:, feature_names]
# head() with no argument shows the first 5 rows.
X.head()

## 1.4. Add a new feature to the data

In [None]:
# Load the Credit data, keeping the columns at positions 1 through 11
# (position 0 is skipped).
credit = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Credit.csv',
    usecols=list(range(1, 12)),
)
# Show the first 3 rows.
credit.head(n=3)

In [None]:
# Add a numeric copy of the 'Student' column: 'No' -> 0, 'Yes' -> 1.
# Any value not present in the mapping becomes NaN.
credit['Student2'] = credit['Student'].map({'No':0, 'Yes':1})
# Show the first 3 rows, now including the new column.
credit.head(n=3)

## 1.5. Removing features from the data

In [None]:
# Load the Heart data as-is. The absolute /content/drive/... path is used so
# the cell does not depend on the kernel's current working directory.
df = pd.read_csv('/content/drive/MyDrive/time-series-2022/Data/Heart.csv')
# Show the first 3 rows.
df.head(3)

In [None]:
# Reload the Heart data, remove the 'Unnamed: 0' column, then discard every
# row that still contains a missing value.
df = (
    pd.read_csv('/content/drive/MyDrive/time-series-2022/Data/Heart.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
)
# Show the first 3 rows.
df.head(n=3)

In [None]:
# Reload the Heart data, remove the 'ChestPain' column, then discard every
# row that still contains a missing value.
df = (
    pd.read_csv('/content/drive/MyDrive/time-series-2022/Data/Heart.csv')
    .drop(columns='ChestPain')
    .dropna()
)
# Show the first 3 rows.
df.head(n=3)

# 2. Visualization

## 2.1. Scatter plot 

In [None]:
# Load the advertising data (columns at positions 1-4 only) and print its summary.
advertising = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Advertising.csv',
    usecols=[1, 2, 3, 4],
)
advertising.info()

# TV goes on the horizontal axis, Sales on the vertical axis.
X = advertising['TV']
y = advertising['Sales']

# Draw the points on an explicitly created axes object.
fig, ax = plt.subplots()
ax.scatter(X, y, color='black')

plt.show()

## 2.2. Histogram

In [None]:
# Load the Hitters data and drop every row that has a missing value.
df = pd.read_csv('/content/drive/MyDrive/time-series-2022/Data/Hitters.csv').dropna()
# Only a cell's last expression is echoed automatically, so a preview in the
# middle of a cell has to go through display() to actually be shown.
display(df.head(3))

# Prepare X, y
feature_names = ['Years', 'Hits']
X = df[feature_names].values
# Natural log of the salary, to compare its distribution with the raw one.
y = np.log(df['Salary'].values)

# Side-by-side histograms: raw salary (left) and log salary (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11,4))
ax1.hist(df['Salary'].values)
ax1.set_xlabel('Salary')
ax2.hist(y)
ax2.set_xlabel('Log (Salary)');
plt.show()

## 2.3. Overlaying multiple plots

In [None]:
# Load the Wage data.
df_wage = pd.read_csv('/content/drive/MyDrive/time-series-2022/Data/Wage.csv')
# Only a cell's last expression is echoed automatically, so a preview in the
# middle of a cell has to go through display() to actually be shown.
display(df_wage.head(3))

# Layer 1: the raw (age, wage) points as hollow, semi-transparent markers.
fig, ax = plt.subplots(1, 1, figsize=(5.5, 5))
ax.scatter(df_wage.age, df_wage.wage, facecolor='None', edgecolor='k', alpha=0.3)
# Layer 2: a first-order regression fit drawn on the same axes.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally; the keyword form also works on older releases.
sns.regplot(x=df_wage.age, y=df_wage.wage, order=1, truncate=True, scatter=False, ax=ax, ci=100)
# Start the y-axis at zero ('bottom' is the current name of the legacy 'ymin' alias).
ax.set_ylim(bottom=0);

# 3. Data processing

## 3.1. Normalizing the data

In [None]:
from sklearn.preprocessing import scale

# Load the advertising data (columns at positions 1-4 only) and print its summary.
advertising = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/time-series-2022/Data/Advertising.csv',
    usecols=[1, 2, 3, 4],
)
advertising.info()

# with_mean=True / with_std=False subtracts the mean of TV but does not
# rescale its spread; reshape(-1, 1) turns the result into a single column.
# NOTE(review): the section is titled "Normalizing", yet only mean-centering
# is applied here - confirm whether with_std=True was intended.
X = scale(advertising['TV'], with_mean=True, with_std=False).reshape(-1, 1)
y = advertising['Sales']

# Scatter of centered TV against Sales on an explicitly created axes object.
fig, ax = plt.subplots()
ax.scatter(X, y, color='black')

plt.show()

## 3.2. Factorizing the data

In [None]:
# Reload the Default workbook so the columns hold their original values.
df = pd.read_excel(
    io='/content/drive/MyDrive/time-series-2022/Data/Default.xlsx',
)
# Show the first 3 rows.
df.head(n=3)

In [None]:
# Series.factorize() returns a pair: an array of integer codes (one per row)
# and the unique values those codes stand for.
default_codes, target_names = df['default'].factorize()
student_codes, _ = df['student'].factorize()

# Overwrite the original columns with their integer codes.
# NOTE: this cell is not idempotent - running it a second time factorizes the
# integer codes themselves, so target_names would no longer hold the original
# labels. Re-run the loading cell first if it has to be repeated.
df['default'] = default_codes
df['student'] = student_codes

print('Target names:', target_names)
df.head(n=3)

# Exercise

1. Load "/content/drive/MyDrive/time-series-2022/Data/Auto.csv" and print the first 5 and the last 5 rows of the dataset.
2. Print the dataset description using the info() function to find out how many entries there are.
3. Select only the 'horsepower' and 'mpg' features.
4. Draw a scatter plot of ('horsepower', 'mpg') from the loaded dataset.
5. Plot a histogram of the 'horsepower' values from the loaded dataset.

In [None]:
# Load /content/drive/MyDrive/time-series-2022/Data/Auto.csv



In [None]:
# Print the first 5 rows of the dataset



In [None]:
# Print the last 5 rows of the dataset.



In [None]:
# Print the dataset description using the info() function to find out how many entries there are.



In [None]:
# Select only the 'horsepower' and 'mpg' features.



In [None]:
# Plot the scatter plot of the ('horsepower', 'mpg') from the loaded dataset.



In [None]:
# Plot the histogram of the 'horsepower' values from the loaded dataset.

