In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
# Global plotting configuration, applied once for every figure in the notebook.
rc = {
    'figure.dpi': 600,              # figure resolution in dots per inch
    'font.sans-serif': ['SimHei'],  # sans-serif font list; SimHei is used so the Chinese labels can render
    'axes.unicode_minus': False,    # use an ASCII hyphen for negative tick labels instead of U+2212
}
# set_theme() is seaborn's preferred interface; sns.set() is only an alias for it.
# The rc overrides are passed here so they are applied together with the style.
sns.set_theme(context='notebook', style='whitegrid', rc=rc)

In [None]:
from pet.datasets import factory
students = factory.load_data('研究生初试成绩')

In [None]:
students.shape

In [None]:
students.head(2)

In [None]:
students.describe()

In [None]:
# Normalise column dtypes:
#   - id is kept as a string rather than a number,
#   - 毕业年月 is parsed from the YYYYMM form into a datetime,
#   - the label-like columns become pandas categoricals.
students['id'] = students['id'].astype(str)
students['毕业年月'] = pd.to_datetime(students['毕业年月'], format='%Y%m')
for label_col in ('性别', '英语分类', '数学分类', '专业课分类'):
    students[label_col] = students[label_col].astype('category')

In [None]:
import datetime

# Fixed reference date against which the time since graduation is measured.
# NOTE(review): presumably tied to the exam-preparation timeline; confirm the choice of date.
REVIEW_REFERENCE_DATE = datetime.datetime(2019, 7, 1)

# 复习时间: whole days between each student's graduation month and the reference
# date. Computed in a single step so the column never holds an intermediate timedelta.
students['复习时间'] = (REVIEW_REFERENCE_DATE - students['毕业年月']).dt.days

# Export the enriched table. index=False is the documented boolean for omitting
# the row index (the previous index=None only worked because None is falsy).
students.to_excel('st1.xlsx', index=False)
students.head(2)

In [None]:
students.dtypes

In [None]:
order = ['id', '性别', '本科院校', '本科专业', '毕业年月','复习时间', '报考代码', '英语分类', '英语成绩', '数学分类', '数学成绩', '政治成绩', '专业课分类', '专业课成绩',
       '报考院校', '报考专业', '总分' ]
students = students[order]

In [None]:
students

可视化数据分布

In [None]:
sns.displot(students, x="英语成绩")

In [None]:
sns.displot(students, x="总分",col='数学分类')

In [None]:
sns.displot(students, x="总分",col='数学分类',binwidth=4)

In [None]:
sns.displot(students, x="总分",col='数学分类',kind='kde')

In [None]:
sns.displot(students, x="总分",col='数学分类',kind='kde',bw_adjust=.2)
sns.displot(students, x="总分",col='数学分类',kind='kde',bw_adjust=2)

In [None]:
sns.kdeplot(students['总分'], bw_method=.5, label="bw: 0.5")
sns.kdeplot(students['总分'], bw_method=1, label="bw: 1")
sns.kdeplot(students['总分'], bw_method=2, label="bw: 2")
plt.legend()


In [None]:
sns.displot(data=students, x="英语成绩",binwidth=1,kde=True,col='性别')

In [None]:
sns.displot(data=students, x="英语成绩",col='性别')

In [None]:
sns.displot(data=students, x="英语成绩",kind='kde',col='性别')

In [None]:
sns.displot(data=students, x="英语成绩",kde=True,col='性别')

In [None]:
sns.displot(students, x="总分", kind="ecdf",col='性别')

In [None]:
sns.displot(students, x="数学成绩", hue="数学分类", kind="ecdf",col='性别')

In [None]:
sns.displot(data=students, x="数学成绩", y="总分",col='性别')

In [None]:
sns.displot(data=students, x="数学成绩", y="总分",col='性别',kind='kde')

In [None]:
sns.displot(students, x="数学成绩", y="总分", hue="数学分类", kind="kde",col='性别')

In [None]:
sns.jointplot(data=students, x="数学成绩", y="英语成绩")

In [None]:
sns.jointplot(data=students, x="数学成绩", y="英语成绩",kind='kde')

In [None]:
sns.relplot(data=students,x='数学成绩',y='英语成绩',col='性别')

In [None]:
sns.relplot(data=students, x="数学成绩", y="英语成绩", hue='数学分类',style="英语分类",col='性别')

In [None]:
sns.relplot(data=students, x="数学成绩", y="英语成绩", hue='数学分类',style="英语分类",col='性别',size='英语成绩')

In [None]:
fig=sns.relplot(data=students,x='数学成绩',y='总分',kind='line',col='性别')
fig

In [None]:
sns.relplot(data=students,x='英语成绩',y='总分',kind='line',col='性别',estimator=None)

In [None]:
sns.relplot( data=students, kind="line", x="英语成绩", y="数学成绩", hue="性别", style="数学分类", dashes=False, markers=True,col='性别')


In [None]:
fig.savefig('rel.png')

In [None]:
sns.relplot(
    data=students, kind="line",
    x="总分", y="专业课成绩", hue="性别",
    col="数学分类", row="性别", height=4,
    estimator=None)


In [None]:
sns.jointplot(x="数学成绩", y="总分",data=students, kind="hex")

In [None]:
sns.jointplot(x="数学成绩", y="总分", data=students,kind="reg")

In [None]:
sns.jointplot(x="英语成绩", y="数学成绩", data=students, kind="kde")

In [None]:
sns.jointplot(x="政治成绩", y="总分", data=students, kind="kde")

In [None]:
sns.pairplot(students)

In [None]:
sns.pairplot(students, hue="性别")

In [None]:
sns.pairplot(students, x_vars=["数学成绩", "英语成绩",'政治成绩'], y_vars=["总分"], height=5, aspect=.8)

In [None]:
sns.catplot(x="性别", y="总分", data=students,col='数学分类')

In [None]:


plot = students['报考专业'].value_counts().plot.pie( figsize=(5, 5))

In [None]:
sns.catplot(x="性别", y="总分", data=students,col='数学分类',jitter=False)

In [None]:
sns.catplot(y="数学分类", x="总分",kind="swarm", data=students,col='性别')

In [None]:
sns.catplot(data=students, x="报考专业", y="总分", hue="性别", kind="swarm")
plt.tick_params(axis='x',labelsize=8,rotation=45)

In [None]:
sns.catplot(x="性别", y="英语成绩", kind="box", data=students,col='数学分类')

In [None]:
sns.catplot(x="性别", y="英语成绩", kind="box", hue="英语分类", data=students, order=['男','女'],col='数学分类');

In [None]:
sns.catplot(x="性别", y="英语成绩", kind="boxen", hue="英语分类", data=students, order=['男','女'],col='数学分类');

In [None]:
sns.catplot(x="数学分类", y="总分", hue="性别",kind="violin", data=students,col='英语分类')

In [None]:
sns.catplot(x="性别", y="总分", hue="数学分类", kind="bar", data=students,col='英语分类')

In [None]:
sns.catplot(x="报考专业", y="总分", hue="性别", kind="point", data=students);
plt.tick_params(axis='x',labelsize=8,rotation=90)



In [None]:
sns.catplot(x="数学分类", hue="性别", kind="count", data=students,col='英语分类') 

In [None]:
sns.catplot(x="报考专业", y="总分", kind='point',data=students)
plt.tick_params(axis='x',labelsize=8,rotation=90)

In [None]:
sns.pointplot(x="报考专业", y="总分", data=students)
plt.tick_params(axis='x',labelsize=8,rotation=90)

In [None]:
sns.catplot(data=students, x="复习时间", y="英语成绩",kind="swarm",col='性别')

In [None]:
sns.regplot(x="数学成绩", y="总分", data=students)

In [None]:
sns.lmplot(x="数学成绩", y="总分", data=students,col='性别')

In [None]:
sns.pairplot(students, x_vars=["数学成绩", "英语成绩",'政治成绩'], y_vars=["总分"], height=5, aspect=.8, kind="reg");

In [None]:
sns.lmplot(x="数学成绩", y="总分", data=students,x_estimator=np.mean,col='性别');

In [None]:
sns.catplot(data=students, x="性别", y="总分",kind="box",col='英语分类')

In [None]:
sns.lmplot(x="数学成绩", y="总分", hue="性别", markers=["o", "x"],col="英语分类", data=students);

In [None]:
sns.lmplot(x="数学成绩", y="总分", hue="性别", markers=["o", "x"],col="英语分类", palette="Set2",height=4,data=students);

In [None]:
sns.lmplot(data=students,x='数学成绩',y='总分',order=4,col='性别')

In [None]:
sns.lmplot(data=students,x='专业课成绩',y='总分',order=4)

In [None]:
# Histogram of 英语成绩 with bins exactly 1 unit wide.
# seaborn lets binwidth override bins, so the earlier bins=50 argument had no
# effect and was removed to avoid suggesting that both settings apply.
sns.displot(students, x="英语成绩", binwidth=1)

In [None]:
sns.displot(students, x="英语成绩", hue="英语分类")

In [None]:
sns.displot(students, x="英语成绩", hue="英语分类",element="step")

In [None]:
sns.displot(students, x="英语成绩", hue="英语分类", multiple="stack")

In [None]:
sns.displot(students, x="英语成绩", hue="英语分类",multiple="dodge")

In [None]:
sns.displot(students, x="英语成绩", hue="英语分类",multiple="dodge",col="性别")

In [None]:
sns.displot(students, x="数学成绩", hue="数学分类", stat="density")

In [None]:
sns.displot(students, x="数学成绩", hue="数学分类", stat="probability")

In [None]:
sns.displot(students, x="数学成绩", kind="kde",col='性别')

In [None]:
sns.displot(students, x="数学成绩", kind="kde",col='性别',bw_adjust=.1)

In [None]:
sns.displot(students, x="数学成绩", kind="kde",col='性别',bw_adjust=2)

In [None]:
sns.displot(students, x="数学成绩", hue='数学分类',kind="kde",col='性别',bw_adjust=.25)

In [None]:
sns.displot(students, x="数学成绩", hue='数学分类',kind="kde",col='性别',bw_adjust=.25,multiple="stack")

In [None]:
sns.displot(students, x="数学成绩", hue='数学分类',kde=True,col='性别',multiple="stack")

In [None]:
sns.displot(students, x="数学成绩", hue='数学分类',col='性别',kind="ecdf")

In [None]:
sns.displot(students, x="数学成绩", y="英语成绩")

In [None]:
sns.displot(students, x="数学成绩", y="英语成绩",kind='kde',col='性别')

In [None]:
sns.pairplot(students)

In [None]:
sns.displot(data=students,x='数学成绩',binwidth=3)

In [None]:
sns.displot(data=students,x='数学成绩',binwidth=13)