# 数据处理

## Series

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

from pandas import Series, DataFrame

In [None]:
# Draw the ten sample values from an explicitly seeded generator so that the
# numbers shown below are identical on every Restart & Run All; the unseeded
# np.random.randn call produced different values on each run.
rng = np.random.default_rng(42)
s = Series(rng.standard_normal(10))
s

In [None]:
# Three ways of reading from a Series that carries the default 0..n-1 labels:
#   .iloc[1:3]   -> positional slice, end excluded
#   .loc[[1, 2]] -> the rows labelled 1 and 2
#   .loc[2]      -> the single value labelled 2 (a scalar)
s.iloc[1:3], s.loc[[1, 2]], s.loc[2]

In [None]:
# The index holds the row labels of the Series.
s.index

In [None]:
# Relabel a copy so that `s` keeps its own index; the new labels are the
# strings "item 0" through "item 9".
s1 = s.copy()
s1.index = [f"item {pos}" for pos in range(10)]
s1

In [None]:
# The values of `s` in reverse order, labelled "item 0" through "item 9".
s2 = Series(
    s.values[::-1],
    index=[f"item {pos}" for pos in range(10)],
)
s2

In [None]:
# Element-wise sum (same as s1 + s2). Operands are matched by index label,
# not by position.
s1.add(s2)

In [None]:
# s3 and s4 are built without an explicit index. concat keeps each input's
# own labels, so s5 carries the labels of s2 followed by the labels of s3.
s3, s4 = Series(["d", "e"]), Series(["f", "g"])
s5 = pd.concat(objs=[s2, s3])
s5

In [None]:
# Look the value up by the label 1 (one of the integer labels contributed by
# s3), stated explicitly with .loc rather than left to plain [] indexing.
s5.loc[1]

### ❓Q1. 我们如何将 `s3` 和 `s4` 合并在一起，同时保证 `index` 是递增的呢？

In [None]:
# ignore_index=True discards the labels of both inputs and numbers the result
# 0..n-1, so the index is increasing however many rows s3 and s4 hold.
# Relabelling s4 by hand with the strings '2' and '3' mixed str and int labels
# in a single index and also modified s4 itself.
s6 = pd.concat([s3, s4], ignore_index=True)
s6

## DataFrame

In [None]:
# A 2 x 5 frame built from a list of row lists, with columns named a..e.
rows = [
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
]
df1 = DataFrame(data=rows, columns=list("abcde"))
df1

In [None]:
# Read googleplaystore.csv (path relative to the working directory) into a
# DataFrame and display it.
gplay = pd.read_csv(filepath_or_buffer="googleplaystore.csv")
gplay

### ❓Q2. 试去除 `gplay` 中的 `NaN` 数据

In [None]:
# Drop every row that contains at least one missing value. The result is
# rebound to the name instead of using inplace=True, so the object displayed by
# the loading cell is not mutated behind the reader's back. Surviving rows keep
# their original labels (the index is not reset).
gplay = gplay.dropna()
gplay

### ❓Q3. 查看 `gplay` 中重复的行，试使用 `iloc` 和 `loc` 来选中重复的行，并谈谈它们的区别

In [None]:
# duplicated() returns a boolean Series: True for a row that repeats an earlier
# row, False for the first occurrence. The mask is built once, used to select
# the repeated rows by label-aligned boolean indexing, and then printed.
dup_mask = gplay.duplicated()
duplicated_apps = gplay.loc[dup_mask]

print(dup_mask)

In [None]:
# Number of rows flagged as duplicates.
print(duplicated_apps.shape[0])

In [None]:
# Display the rows selected as duplicates; they keep their labels from gplay.
duplicated_apps

In [None]:
# duplicated_apps.loc[0] raises an error because no row is labelled 0 here: duplicated() never flags the first occurrence of a row, so the first row of gplay cannot be among the duplicates.

In [None]:
# .iloc is positional: position 0 is the first row of duplicated_apps,
# whatever label that row carries.
first_dup_app_iloc = duplicated_apps.iloc[0, :]
first_dup_app_iloc

#### ''' Your Understanding Here '''

In [None]:
# loc looks rows up by index label; iloc looks them up by integer position.

In [None]:
# Shape with the duplicates, then shape once they are dropped.
# drop_duplicates() returns a new frame; gplay itself is left unchanged.
print(gplay.shape, gplay.drop_duplicates().shape, sep="\n")

## 读取数据

In [None]:
# Read Auto.csv (path relative to the working directory) into a DataFrame and
# display it.
df = pd.read_csv(filepath_or_buffer="Auto.csv")
df

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Replace every '?' cell with pd.NA so it is recognised as missing by the
# isna()/isnull() checks that follow, then store `origin` and `year` as object
# dtype.
# NOTE(review): '?' looks like this file's missing-value marker, and the object
# cast is presumably meant to treat origin/year as labels rather than
# quantities — confirm.
df = df.replace('?', pd.NA).astype({'origin': 'object', 'year': 'object'})

print(df.dtypes)

In [None]:
# Boolean Series, one entry per row: True when any column of that row is missing.
df.isna().any(axis="columns")

In [None]:
# Show only the rows that contain at least one missing value.
df.loc[df.isna().any(axis="columns")]

### ❓Q5. 将 `df` 中所有含有 `NaN` 的行全部移除

In [None]:
# Keep only the rows in which no column is missing; the surviving rows keep
# their original labels.
df = df.dropna(axis="index", how="any")
df

In [None]:
# Simple linear regression of mpg on horsepower.
# add_constant adds a column of ones so the fit includes an intercept; the
# design matrix is cast to float only for the fit, X itself is left as built.
y = df["mpg"]
X = sm.add_constant(df["horsepower"])
model1 = sm.OLS(endog=y, exog=X.astype(float)).fit()

#### ❓Q6. 了解 `matplotlib`，试对上述的代码的预测结果进行可视化处理

In [None]:
# Visualise the fitted line against the observed data.
#
# The x-values are taken from the float-cast design matrix rather than from
# df["horsepower"].
# NOTE(review): df["horsepower"] looks like it is still stored as strings at
# this point (it needed astype(float) before the regression) — confirm.
# Matplotlib treats string data as categories, which would place the points in
# order of appearance instead of on a numeric axis.
X = X.astype(float)
horsepower = X["horsepower"]
predicted_y = model1.predict(X)

fig, ax = plt.subplots()
ax.scatter(horsepower, df["mpg"], label='Actual Data')
ax.plot(horsepower, predicted_y, color='red', label='Predicted Data')

ax.set_xlabel('Horsepower')
ax.set_ylabel('mpg')
ax.set_title('mpg vs. horsepower: OLS fit')

ax.legend()
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Read email_spam.csv (path relative to the working directory) and display it.
# Note that this rebinds `df`, which held the Auto data up to this point.
df = pd.read_csv(filepath_or_buffer="email_spam.csv")
df

#### ❓Q7. 现在我们将考虑对垃圾邮件进行分类。但在生成模型之前，首先将所有列数据转换为数字。具体地说:
- `no` 修改为 `0`，`yes` 修改为 `1`
- 为 `format` 和 `number` 使用虚拟变量编码(你可以使用 `pd.get_dummies()` 来生成虚拟变量)

In [None]:
# Encode the yes/no cells as 1/0. infer_objects() then gives those columns a
# numeric dtype on pandas versions where replace() no longer downcasts object
# columns on its own; where replace() already downcasts it changes nothing.
df = df.replace({'no': 0, 'yes': 1}).infer_objects()

# One indicator column per category of `format` and `number`.
df = pd.get_dummies(df, columns=['format', 'number'])

# Pick the target by name and use every other column as a feature, so the
# target can never leak into X regardless of where `spam` sits in the file
# (slicing by position, iloc[:, 1:], is only correct when `spam` is column 0).
# NOTE(review): assumes every remaining column is meant to be a feature — if the
# CSV carries a leading row-number column, drop it here as well.
y = df["spam"]
X = df.drop(columns="spam")
df

#### ❓Q8. 使用 `train_test_split()` 将数据集划分为 `70%` 的训练集和 `30%` 的测试集(设置 `random_state=123` 以确保我们可以复现该划分)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

#### ❓Q9. 使用逻辑回归的模型来进行预测，同时看看你的准确率如何

In [None]:
# Fit a logistic regression on the training split (fit() returns the estimator
# itself), predict the held-out rows and report plain accuracy.
# NOTE(review): max_iter is set far above the library default, presumably so the
# solver converges on the unscaled features — confirm.
model = LogisticRegression(max_iter=100000).fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_score = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Test Accuracy: ", y_pred_score)

#### ❓Q10. 由于数据是不平衡的，最好生成混淆矩阵。了解什么是混淆矩阵以及如何在sklearn上实现它

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows are the true classes, columns the predicted classes.
# The matrix is computed once: printed as numbers, then handed to the display
# helper. Previously the first call's result was discarded (it was not the
# cell's last expression) and the matrix was recomputed for the plot.
cm = confusion_matrix(y_test, y_pred)
print(cm)
ConfusionMatrixDisplay(cm).plot()