# Exploratory Data Analysis with Pandas Advanced
#####################################################################################

This Jupyter Notebook is designed for the CI7340 Applied Data Programming
 Module for the MSc. Data Science degree programme at Kingston University.

Parts of this module are borrowed from the official [Pandas](https://pandas.pydata.org/) website.

Copyright@ *Nabajeet Barman*, Kingston University, London, UK

#####################################################################################

## Topics Covered:

>* Renaming Columns
>* Getting data in/out
>* Descriptive Statistics
>* Handling Missing Data
>* Reshaping and Pivot Tables
>* Merge, Concatenate and Groupby

A Cheatsheet summarizing the important aspects can be found [here](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)

Let us first import the required libraries and redefine the Employee Records dataframe used in the previous lecture.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Build the employee_records dataframe, one tuple per employee.
# NOTE: the CamelCase column names are deliberate -- later cells rename them!
employee_records = pd.DataFrame(
    data=[
        ('Sam', 'Research', 10001, 45034.88),
        ('Max', 'HR', 10002, 65343.45),
        ('Tony', 'Marketing', 10003, 53423.27),
        ('Sarah', 'Sales', 10004, 76422.34),
        ('Tania', 'Finance', 10005, 58753.00),
        ('David', 'IT', 10006, 34323.44),
        ('Mark', 'HR', 10007, 66544.60),
        ('Alice', 'Marketing', 10008, 34354.66),
        ('Charles', 'IT', 10009, 55234.96),
        ('Bob', 'Finance', 10010, 39078.60),
        ('Anna', 'Sales', 10011, 44567.88),
    ],
    columns=['EmployeeName', 'EmployeeDept', 'EmployeeId', 'Salary'],
)
employee_records

Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


# Renaming Columns

In [None]:
# work on a copy so that employee_records keeps its original column names
df = employee_records.copy()
# rename() builds a new dataframe with lower-cased column labels; the result is
# only displayed here (not assigned), so df still carries the original labels
df.rename(str.lower, axis='columns')

Unnamed: 0,employeename,employeedept,employeeid,salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


In [None]:
# rename() here acts on the Series extracted from df, not on df's column labels:
# displaying df afterwards still shows the column as 'EmployeeDept'
df['EmployeeDept'].rename('employee_dept',inplace=True)

0      Research
1            HR
2     Marketing
3         Sales
4       Finance
5            IT
6            HR
7     Marketing
8            IT
9       Finance
10        Sales
Name: employee_dept, dtype: object

In [None]:
df

Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


In [None]:
# map every old column label to its snake_case replacement, modifying df itself
df.rename(
    columns={
        "EmployeeName": "employee_name",
        "EmployeeDept": "employee_dept",
        "EmployeeId": "employee_id",
        "Salary": "salary",
    },
    inplace=True,
)

In [None]:
df

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


In [None]:
# rows, columns
df.shape

(11, 4)

Our dataset consists of 11 rows and four columns

In [None]:
# information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   employee_name  11 non-null     object 
 1   employee_dept  11 non-null     object 
 2   employee_id    11 non-null     int64  
 3   salary         11 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 480.0+ bytes


In [None]:
# number of non NA values
df.count()

employee_name    11
employee_dept    11
employee_id      11
salary           11
dtype: int64

In [None]:
# assigning ranks to entries
df.rank()
# Qn: Where can you use this information?

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,8.0,9.0,1.0,5.0
1,7.0,3.5,2.0,9.0
2,11.0,7.5,3.0,6.0
3,9.0,10.5,4.0,11.0
4,10.0,1.5,5.0,8.0
5,5.0,5.5,6.0,1.0
6,6.0,3.5,7.0,10.0
7,1.0,7.5,8.0,2.0
8,4.0,5.5,9.0,7.0
9,3.0,1.5,10.0,3.0


In [None]:
df

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


# Rearranging columns

## Using column names

In [None]:
# mention column names explicitly
df1 = df[["employee_id","employee_name","employee_dept","salary"]]
df1
# TODO - try this with inplace for the original DF

Unnamed: 0,employee_id,employee_name,employee_dept,salary
0,10001,Sam,Research,45034.88
1,10002,Max,HR,65343.45
2,10003,Tony,Marketing,53423.27
3,10004,Sarah,Sales,76422.34
4,10005,Tania,Finance,58753.0
5,10006,David,IT,34323.44
6,10007,Mark,HR,66544.6
7,10008,Alice,Marketing,34354.66
8,10009,Charles,IT,55234.96
9,10010,Bob,Finance,39078.6


In [None]:
column_names = ["employee_id","employee_name","employee_dept","salary"]
df2 = df.reindex(columns=column_names)
df2

Unnamed: 0,employee_id,employee_name,employee_dept,salary
0,10001,Sam,Research,45034.88
1,10002,Max,HR,65343.45
2,10003,Tony,Marketing,53423.27
3,10004,Sarah,Sales,76422.34
4,10005,Tania,Finance,58753.0
5,10006,David,IT,34323.44
6,10007,Mark,HR,66544.6
7,10008,Alice,Marketing,34354.66
8,10009,Charles,IT,55234.96
9,10010,Bob,Finance,39078.6


## Using a list of column names

In [None]:
cols = df.columns.tolist()
cols

['employee_name', 'employee_dept', 'employee_id', 'salary']

In [None]:
# swap the first and last column names and leave the middle ones where they are:
# [last] + middle slice + [first]
cols = cols[-1:] + cols[1:-1] + cols[0:1]
cols

['salary', 'employee_dept', 'employee_id', 'employee_name']

In [None]:
df3 = df.reindex(columns=cols)
df3

Unnamed: 0,salary,employee_dept,employee_id,employee_name
0,45034.88,Research,10001,Sam
1,65343.45,HR,10002,Max
2,53423.27,Marketing,10003,Tony
3,76422.34,Sales,10004,Sarah
4,58753.0,Finance,10005,Tania
5,34323.44,IT,10006,David
6,66544.6,HR,10007,Mark
7,34354.66,Marketing,10008,Alice
8,55234.96,IT,10009,Charles
9,39078.6,Finance,10010,Bob


## Getting data in and out

* Reading/writing from/to a .csv file
* Reading/writing from/to an Excel file

## Reading/writing from/to a .csv file





In [None]:
employee_records

Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


In [None]:
# write df to a csv file
# (the row index is written as well, as a first column with an empty header)
employee_records.to_csv('sample.csv')

In [None]:
# shell command that lists the files in the current working directory (ls = list);
# remove the leading '#' to run it
#!ls

sample.csv  sample_data


In [None]:
!cat sample.csv | head -5

,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34


In [None]:
# read the file back with default options: the index column that was saved to
# the file comes back as an ordinary data column named 'Unnamed: 0'
sample_df = pd.read_csv('sample.csv')
sample_df

Unnamed: 0.1,Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,0,Sam,Research,10001,45034.88
1,1,Max,HR,10002,65343.45
2,2,Tony,Marketing,10003,53423.27
3,3,Sarah,Sales,10004,76422.34
4,4,Tania,Finance,10005,58753.0
5,5,David,IT,10006,34323.44
6,6,Mark,HR,10007,66544.6
7,7,Alice,Marketing,10008,34354.66
8,8,Charles,IT,10009,55234.96
9,9,Bob,Finance,10010,39078.6


In [None]:
# index_col=0 tells read_csv to use the first column of the file as the row index
sample_df = pd.read_csv('sample.csv',index_col=0)
sample_df

Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


In [None]:
employee_records.to_csv('sample_without_index.csv',index=False)

In [None]:
!ls

sample.csv  sample_data  sample_without_index.csv


In [None]:
!cat sample_without_index.csv | head -5

EmployeeName,EmployeeDept,EmployeeId,Salary
Sam,Research,10001,45034.88
Max,HR,10002,65343.45
Tony,Marketing,10003,53423.27
Sarah,Sales,10004,76422.34


In [None]:
sample_df_without_index = pd.read_csv('sample_without_index.csv')
sample_df_without_index

Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,Sam,Research,10001,45034.88
1,Max,HR,10002,65343.45
2,Tony,Marketing,10003,53423.27
3,Sarah,Sales,10004,76422.34
4,Tania,Finance,10005,58753.0
5,David,IT,10006,34323.44
6,Mark,HR,10007,66544.6
7,Alice,Marketing,10008,34354.66
8,Charles,IT,10009,55234.96
9,Bob,Finance,10010,39078.6


## Reading/writing from/to an Excel file


In [None]:
employee_records.to_excel('sample_excel.xlsx', sheet_name="My_EmployeeRecords")

In [None]:
!ls

sample.csv  sample_data  sample_excel.xlsx  sample_without_index.csv


In [None]:
# read the workbook back with default options: as with the csv file, the saved
# index shows up as an extra 'Unnamed: 0' data column
df_excel = pd.read_excel('sample_excel.xlsx')
df_excel

Unnamed: 0.1,Unnamed: 0,EmployeeName,EmployeeDept,EmployeeId,Salary
0,0,Sam,Research,10001,45034.88
1,1,Max,HR,10002,65343.45
2,2,Tony,Marketing,10003,53423.27
3,3,Sarah,Sales,10004,76422.34
4,4,Tania,Finance,10005,58753.0
5,5,David,IT,10006,34323.44
6,6,Mark,HR,10007,66544.6
7,7,Alice,Marketing,10008,34354.66
8,8,Charles,IT,10009,55234.96
9,9,Bob,Finance,10010,39078.6


# New section

## Reading from a .txt file

[From Matplotlib DIY assignment](https://colab.research.google.com/drive/11DokhBkrlQ6bmJr3tfe_MwpvFbepKx8u#scrollTo=dOxFBUX-CD90)

You are provided with a dataset that is a subset of a large study of the gaming habits of over 200 participants, consisting of students from the Technical University of Berlin, Germany. It contains each participant's age (age_vals), gender (gender_vals), gaming expertise (gaming_expertise) and the type of screen they usually use while playing games (screen_vals).

Download the data from [here](https://drive.google.com/file/d/13fkPL7c9CQGSVr-cLIqiw3r3_qinDg2W/view?usp=sharing)


In [None]:
df_tub = pd.read_csv('data_tub.txt')
df_tub

In [None]:
# pass the field separator explicitly so the file is split into columns properly
df = pd.read_csv('data_tub.txt', sep=';')
df.head()

In [None]:
# same read as before, but with our own column names supplied via names=
# NOTE(review): if data_tub.txt starts with a header line, that line will now be
# read as a data row (names= is given without header=0) -- confirm against the file
df = pd.read_csv('data_tub.txt',delimiter=';',names=["age", "gender", "gaming_expertise", "screen_type"])
df.head()

In [None]:
df.to_csv('data_tub_rewritten.txt',index=False,sep='\t') # tab separated

# Descriptive Statistics

![Descriptive Stats ](https://drive.google.com/uc?export=view&id=1eyBbDVq_PNL_6wmFLdI6NGWNpF7SSENO)

For more information, please refer to the pandas documentation [here](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#descriptive-statistics).


Most of these are aggregations (hence producing a lower-dimensional result) like `sum(), mean(), and quantile()`, but some of them, like `cumsum() and cumprod()`, produce an object of the same size. Generally speaking, these methods take an axis argument, just like `ndarray.{sum, std, …}`, but the axis can be specified by name or integer:

>* Series: no axis argument needed
>* DataFrame: “index” (axis=0, default), “columns” (axis=1)

In [None]:
# three random columns defined on overlapping (but not identical) indexes;
# the resulting frame is indexed a-d and has NaN wherever a column has no entry
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=list('abc')),
        'two': pd.Series(np.random.randn(4), index=list('abcd')),
        'three': pd.Series(np.random.randn(3), index=list('bcd')),
    }
)
df

Unnamed: 0,one,two,three
a,-0.644804,0.608582,
b,0.352068,0.91442,-1.765565
c,0.799377,0.576662,-0.316179
d,,-0.242452,0.075116


In [None]:
df.mean()

one      0.168880
two      0.464303
three   -0.668876
dtype: float64

In [None]:
# column-wise mean, with the axis spelled out by keyword
df.mean(axis=0)

one      0.168880
two      0.464303
three   -0.668876
dtype: float64

In [None]:
# row-wise mean, with the axis spelled out by keyword
df.mean(axis=1)

a   -0.018111
b   -0.166359
c    0.353287
d   -0.083668
dtype: float64

In [None]:
df.mean(skipna=False)

one           NaN
two      0.464303
three         NaN
dtype: float64

In [None]:
df

Unnamed: 0,one,two,three
a,-0.644804,0.608582,
b,0.352068,0.91442,-1.765565
c,0.799377,0.576662,-0.316179
d,,-0.242452,0.075116


In [None]:
# row-wise mean without skipping NaN: any row containing a NaN yields NaN
df.mean(axis=1, skipna=False)

a         NaN
b   -0.166359
c    0.353287
d         NaN
dtype: float64

In [None]:
df.sum()

one      0.506641
two      1.857212
three   -2.006628
dtype: float64

In [None]:
df.sum(skipna=False)

one           NaN
two      1.857212
three         NaN
dtype: float64

### Using numpy

In [None]:
# column-wise mean via NumPy. axis=0 is passed explicitly: without it NumPy
# forwards axis=None, which pandas >= 2.0 treats as "reduce over both axes"
# and returns one scalar instead of one mean per column.
np.mean(df, axis=0)

one      0.168880
two      0.464303
three   -0.668876
dtype: float64

In [None]:
np.mean(df['one'])

0.16888030648838834

In [None]:
# convert the column to a plain NumPy array first, then take the mean of that array
np.mean(df['one'].to_numpy())
# TODO: Why nan?

nan

In [None]:
np.mean(df['two'].to_numpy())

0.46430307373592017

## describe function

In [None]:
# small mixed-type frame: a text column 'a' and an integer column 'b'
df = pd.DataFrame(
    {
        'a': ['Yes'] * 2 + ['No'] * 2,
        'b': range(4),
    }
)
df

Unnamed: 0,a,b
0,Yes,0
1,Yes,1
2,No,2
3,No,3


In [None]:
df.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [None]:
df.dtypes

a    object
b     int64
dtype: object

In [None]:
df.describe(include=['object','int64'])

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [None]:
df.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


### Index of min and max

In [None]:
df = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1.386288,0.25598,-1.213172
1,2.706388,-1.424325,0.68669
2,0.021572,0.808615,0.267555
3,0.198408,0.379626,1.262623
4,-0.716454,-0.661946,0.200897


In [None]:
# for each column: the row label at which that column's minimum value occurs
df.idxmin()

A    4
B    1
C    0
dtype: int64

In [None]:
# for each column: the row label at which that column's maximum value occurs
df.idxmax()

A    1
B    2
C    3
dtype: int64

In [None]:
# for each row: the column label holding that row's minimum value
df.idxmin(1)

0    C
1    B
2    A
3    A
4    A
dtype: object

In [None]:
df

Unnamed: 0,A,B,C
0,1.386288,0.25598,-1.213172
1,2.706388,-1.424325,0.68669
2,0.021572,0.808615,0.267555
3,0.198408,0.379626,1.262623
4,-0.716454,-0.661946,0.200897


In [None]:
# one column in which the minimum (1) occurs twice, plus a missing value
df = pd.DataFrame({'A': [2, 1, 1, 3, np.nan]}, index=list('edcba'))
df

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [None]:
# 'd' and 'c' tie for the minimum (1.0); the label of the first occurrence is
# returned, and the NaN entry is not considered
df['A'].idxmin() # the NumPy equivalents are argmin and argmax

'd'

### value counts

In [None]:
#data = np.random.randint(0, 7, size=50)
#data
data = np.array([6, 3, 2, 6, 5, 6, 3, 1, 5, 5, 5, 1, 2, 3, 4, 2, 6, 3, 
5, 2, 0, 4, 0, 6, 1, 2, 6, 1, 3, 2, 0, 6, 1, 3, 4, 4, 1, 0, 
0, 5, 3, 4, 6, 4, 6, 6, 2, 4, 3, 1])
data

array([6, 3, 2, 6, 5, 6, 3, 1, 5, 5, 5, 1, 2, 3, 4, 2, 6, 3, 5, 2, 0, 4,
       0, 6, 1, 2, 6, 1, 3, 2, 0, 6, 1, 3, 4, 4, 1, 0, 0, 5, 3, 4, 6, 4,
       6, 6, 2, 4, 3, 1])

In [None]:
s = pd.Series(data)
s

In [None]:
s.value_counts()

In [None]:
df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),"B": np.random.randint(-10, 15, size=50)})
df

In [None]:
df.mode()

In [None]:
df.median()

In [None]:
df.corr()

# Session 2

# (Handling) Missing Data

In [None]:
# employee data with missing entries (np.nan) scattered through every column:
#   * row 11 is missing in all four columns
#   * row 12 repeats row 4 ('Tania') value for value
df = pd.DataFrame({
        'employee_name': ['Sam', 'Max', 'Tony', 'Sarah', 'Tania', 'David', 'Mark','Alice', 'Charles', 'Bob', 'Anna', np.nan, 'Tania'],
        'employee_dept': ['Research',np.nan,'Marketing','Sales', 'Finance', 'IT', 'HR', 'Marketing', 'IT', np.nan,'Sales', np.nan, 'Finance'],
        'employee_id' : [10001, 10002, np.nan, 10004, 10005, 10006, np.nan, 10008, 10009, 10010, 10011, np.nan, 10005],
        'salary'     : [np.nan, 65343.45, 53423.27, np.nan, 58753.00, 34323.44, 66544.60, np.nan, 55234.96, 39078.60, 44567.88, np.nan, 58753.0 ]
    })
df

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,39078.6


In [None]:
# to find data with NaN values
df[df.isna().any(axis=1)]

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
9,Bob,,10010.0,39078.6
11,,,,


In [None]:
# keep only the COLUMNS that contain at least one NaN.
# The mask produced by any(axis=0) is labelled by column name, so it has to be
# applied on the column axis through .loc; plain df[mask] would try to use it as
# a row mask and raise an IndexingError.
df.loc[:, df.isna().any(axis=0)]

  """Entry point for launching an IPython kernel.


IndexingError: ignored

In [None]:
df.isna().any(axis=0)

employee_name    True
employee_dept    True
employee_id      True
salary           True
dtype: bool

In [None]:
df.columns[df.isna().any(axis=0)]

Index(['employee_name', 'employee_dept', 'employee_id', 'salary'], dtype='object')

In [None]:
df.isna()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,False,False,False,True
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,False,False,False,False
5,False,False,False,False
6,False,False,True,False
7,False,False,False,True
8,False,False,False,False
9,False,True,False,False


In [None]:
df.isna().any(axis=1)

0      True
1      True
2      True
3      True
4     False
5     False
6      True
7      True
8     False
9      True
10    False
11     True
12    False
dtype: bool

In [None]:
# indexing with the boolean frame keeps only the cells where isna() is True, i.e.
# the missing ones, so every displayed cell is NaN -- remove the '#' to try it
#df[df.isna()]

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [None]:
df.isna()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,False,False,False,True
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,False,False,False,False
5,False,False,False,False
6,False,False,True,False
7,False,False,False,True
8,False,False,False,False
9,False,True,False,False


In [None]:
df.isnull()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,False,False,False,True
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,False,False,False,False
5,False,False,False,False
6,False,False,True,False
7,False,False,False,True
8,False,False,False,False
9,False,True,False,False


## Inserting missing data

In [None]:
# assigning None to a cell of the numeric salary column stores it as NaN
df.loc[9,'salary'] = None
df

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,


In [None]:
df_copy = df.copy()
df_copy.loc[10,'employee_name'] = None
df_copy

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,


## Handling missing data

In [None]:
df.dropna()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
8,Charles,IT,10009.0,55234.96
10,Anna,Sales,10011.0,44567.88
12,Tania,Finance,10005.0,58753.0


In [None]:
df_copy.dropna()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
8,Charles,IT,10009.0,55234.96
12,Tania,Finance,10005.0,58753.0


In [None]:
# how='all' drops a row only when every column in that row is missing
df.dropna(how='all')

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,


In [None]:
# thresh=2 keeps only the rows that have at least 2 non-missing values
df.dropna(thresh=2)

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,


## Filling in missing data with a value

In [None]:
# every NaN is replaced by 5 -- in the text columns too (see employee_dept)
df.fillna(value=5)

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,5.0
1,Max,5,10002.0,65343.45
2,Tony,Marketing,5.0,53423.27
3,Sarah,Sales,10004.0,5.0
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,5.0,66544.6
7,Alice,Marketing,10008.0,5.0
8,Charles,IT,10009.0,55234.96
9,Bob,5,10010.0,5.0


# Checking for duplicates

In [None]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
dtype: bool

In [None]:
df[df.duplicated()]

Unnamed: 0,employee_name,employee_dept,employee_id,salary
12,Tania,Finance,10005.0,58753.0


In [None]:
df.drop_duplicates()

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,


In [None]:
df

Unnamed: 0,employee_name,employee_dept,employee_id,salary
0,Sam,Research,10001.0,
1,Max,,10002.0,65343.45
2,Tony,Marketing,,53423.27
3,Sarah,Sales,10004.0,
4,Tania,Finance,10005.0,58753.0
5,David,IT,10006.0,34323.44
6,Mark,HR,,66544.6
7,Alice,Marketing,10008.0,
8,Charles,IT,10009.0,55234.96
9,Bob,,10010.0,
