In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the fraud dataset from a CSV file into a DataFrame.
# pd.read_csv('<path>') accepts a relative path, e.g. read_csv('users/downloads/fraudData.csv')

df = pd.read_csv('fraudData.csv')

# Information about the dataset


In [5]:

df.shape

# shape is a (rows, columns) tuple: here there are 11 rows and 9 columns

(11, 9)

In [6]:
df.info()

# info() reports the number of non-null values per column, not the number of rows.
# custID, gender, state, etc. each have 11 non-null values (one for every row),
# but cardholder and balance have only 9 each, so each of those two columns has 2 null values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 9 columns):
custID          11 non-null int64
gender          11 non-null int64
state           11 non-null int64
cardholder      9 non-null float64
balance         9 non-null float64
numTrans        11 non-null int64
numIntlTrans    11 non-null int64
creditLine      11 non-null int64
fraudRisk       11 non-null int64
dtypes: float64(2), int64(7)
memory usage: 872.0 bytes


In [7]:
# Count the null values in each column:
# isna() flags every null cell as True, and sum() adds up the True values per column.

df.isna().sum()

custID          0
gender          0
state           0
cardholder      2
balance         2
numTrans        0
numIntlTrans    0
creditLine      0
fraudRisk       0
dtype: int64

In [13]:
# Passing dropna=False makes value_counts() include the count of null (NaN) entries as well.
# NaN (Not-a-Number) marks a missing value / empty cell.

df['cardholder'].value_counts(dropna = False)

# value_counts() is generally used for discrete data, e.g. Yes/No

1.0    8
NaN    2
2.0    1
Name: cardholder, dtype: int64

In [14]:
# Display the whole DataFrame (it is small enough to show in full)
df

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,,,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


# Selection

In [16]:
df.iloc[1,2]

# iloc positions are zero-based: [1,2] selects the single value at row position 1 (the 2nd row)
# and column position 2 (the 3rd column, 'state').
# NOTE(review): the recorded output below shows the entire 'state' column, which is what
# df.iloc[:,2] returns; this cell returns one scalar -- re-run the cell to refresh its output.


0     35
1      2
2      2
3     15
4     46
5     44
6      3
7     10
8     32
9     23
10    46
Name: state, dtype: int64

In [17]:
df.iloc[:,2]

# ':' means all rows; 2 is a zero-based column position, so this returns the whole 3rd column ('state')


0     35
1      2
2      2
3     15
4     46
5     44
6      3
7     10
8     32
9     23
10    46
Name: state, dtype: int64

In [19]:
df.iloc[:,1:4]

# all rows, and column positions 1 up to (but not including) 4, i.e. 1, 2, 3: gender, state, cardholder


Unnamed: 0,gender,state,cardholder
0,1,35,1.0
1,2,2,1.0
2,2,2,1.0
3,1,15,1.0
4,1,46,1.0
5,2,44,2.0
6,1,3,
7,1,10,1.0
8,2,32,1.0
9,1,23,1.0


In [21]:
df.iloc[1:5,1:4]

# row positions 1, 2, 3, 4 (the slice end, 5, is excluded) and column positions 1, 2, 3 (4 is excluded)

Unnamed: 0,gender,state,cardholder
1,2,2,1.0
2,2,2,1.0
3,1,15,1.0
4,1,46,1.0


In [25]:
# one column:       df['<columnName>']
# several columns:  df[['<colName>', '<colName>']]   (note the list inside the outer brackets)

df[['gender','cardholder']]


Unnamed: 0,gender,cardholder
0,1,1.0
1,2,1.0
2,2,1.0
3,1,1.0
4,1,1.0
5,2,2.0
6,1,
7,1,1.0
8,2,1.0
9,1,1.0


In [26]:
df.iloc[:,[0,4]]

# all rows, and the columns at positions 0 and 4 (custID and balance)

Unnamed: 0,custID,balance
0,1,3000.0
1,2,0.0
2,3,0.0
3,4,
4,5,0.0
5,6,5546.0
6,7,
7,8,6016.0
8,9,2428.0
9,10,0.0


# Cleaning

In [32]:
# Drop null values along either axis and compare the results.
#   axis=0 is dropna()'s default: it removes every row that contains a null value,
#          so `a` and `b` hold the same result.
#   axis=1 removes every column that contains a null value.

a = df.dropna()
b = df.dropna(axis=0)
c = df.dropna(axis=1)

# One print call with sep='\n' writes each frame, its shape and the separator
# on their own lines.
print(
    a, a.shape,
    'xxxxxxxxxxxxxxxxxxxx',
    b, b.shape,
    'xxxxxxxxxxxxxxxxxxxx',
    c, c.shape,
    sep='\n',
)

   custID  gender  state  cardholder  balance  numTrans  numIntlTrans  \
0       1       1     35         1.0   3000.0         4            14   
1       2       2      2         1.0      0.0         9             0   
2       3       2      2         1.0      0.0        27             9   
4       5       1     46         1.0      0.0        11            16   
5       6       2     44         2.0   5546.0        21             0   
7       8       1     10         1.0   6016.0        20             3   
8       9       2     32         1.0   2428.0         4            10   
9      10       1     23         1.0      0.0        18            56   

   creditLine  fraudRisk  
0           2          0  
1          18          0  
2          16          0  
4           7          1  
5          13          0  
7           6          0  
8          22          0  
9           5          1  
(8, 9)
xxxxxxxxxxxxxxxxxxxx
   custID  gender  state  cardholder  balance  numTrans  numIntlTrans  

In [33]:
# Drop only the rows whose 'balance' value is null.
# Rows that contain null values in other columns are not deleted.

d = df.dropna(subset=['balance'])
print(d, d.shape, sep='\n')


    custID  gender  state  cardholder  balance  numTrans  numIntlTrans  \
0        1       1     35         1.0   3000.0         4            14   
1        2       2      2         1.0      0.0         9             0   
2        3       2      2         1.0      0.0        27             9   
4        5       1     46         1.0      0.0        11            16   
5        6       2     44         2.0   5546.0        21             0   
7        8       1     10         1.0   6016.0        20             3   
8        9       2     32         1.0   2428.0         4            10   
9       10       1     23         1.0      0.0        18            56   
10      11       1     46         NaN   4601.0        54             0   

    creditLine  fraudRisk  
0            2          0  
1           18          0  
2           16          0  
4            7          1  
5           13          0  
7            6          0  
8           22          0  
9            5          1  
10     

In [34]:
# fillna: replace every null value in the frame with a new value (here the constant 1000)

e = df.fillna(value=1000)
e

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,1000.0,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,1000.0,1000.0,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


In [35]:
# Replace null values with the mean of their own column:
# df.mean() gives one mean per column, and fillna applies each mean to the matching column.

f = df.fillna(value=df.mean())
f

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,2399.0,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,1.111111,2399.0,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


In [38]:
# Overwrite one cell in place: row position 6, column position 3 ('cardholder').
# This modifies df itself, so every later cell sees the new value.
df.iloc[6,3] = 2
df

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,2.0,,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


In [41]:
# Plant the string 'hai' in the 'cardholder' column (column position 3) so the
# following cells can demonstrate how to clean up a non-numeric value.
# 'cardholder' is a float64 column; writing a string into it relies on pandas
# silently changing the column dtype, which recent pandas versions deprecate.
# Widen the column to object dtype explicitly before assigning the strings.
df['cardholder'] = df['cardholder'].astype(object)
df.iloc[6,3] = 'hai'
df.iloc[10,3] = 'hai'
df

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1,3000.0,4,14,2,0
1,2,2,2,1,0.0,9,0,18,0
2,3,2,2,1,0.0,27,9,16,0
3,4,1,15,1,,12,0,5,0
4,5,1,46,1,0.0,11,16,7,1
5,6,2,44,2,5546.0,21,0,13,0
6,7,1,3,hai,,41,0,1,0
7,8,1,10,1,6016.0,20,3,6,0
8,9,2,32,1,2428.0,4,10,22,0
9,10,1,23,1,0.0,18,56,5,1


In [45]:
# Replace the string value 'hai' with the column mean.
#
# df.replace('hai', df.mean()) will not work: the replacement must be the mean of a
# numeric column, and 'cardholder' is not numeric while it still contains 'hai'.
# Hence convert the string to NaN first, and then replace NaN with the mean value.

g = df.replace('hai',np.nan)
# With the strings gone, turn the object column back into a numeric dtype.
# Older pandas did this implicitly inside replace(); infer_objects() makes it explicit
# so that g.mean() sees a numeric 'cardholder' column on every pandas version.
g = g.infer_objects()
g = g.fillna(g.mean())
g

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,2399.0,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,1.111111,2399.0,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


In [52]:
# replace() returns a new DataFrame and leaves `df` untouched, so calling it
# without assigning the result does not change the actual variable `df`:
df.replace('hai',np.nan)
# Assigning the result back is what updates `df`. infer_objects() then restores a
# numeric dtype for 'cardholder' now that the column no longer holds strings.
df = df.replace('hai',np.nan).infer_objects()
# An assignment displays nothing, so show the updated frame explicitly.
df



Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,1,1,35,1.0,3000.0,4,14,2,0
1,2,2,2,1.0,0.0,9,0,18,0
2,3,2,2,1.0,0.0,27,9,16,0
3,4,1,15,1.0,,12,0,5,0
4,5,1,46,1.0,0.0,11,16,7,1
5,6,2,44,2.0,5546.0,21,0,13,0
6,7,1,3,,,41,0,1,0
7,8,1,10,1.0,6016.0,20,3,6,0
8,9,2,32,1.0,2428.0,4,10,22,0
9,10,1,23,1.0,0.0,18,56,5,1


In [55]:
# replace() and fillna() can also be applied to one specific column by name.
# Neither call is assigned back, so df itself is not changed by this cell.
# Only the last expression of a cell is displayed, so the result of the first
# line is computed and then discarded; the output shown is the fillna result.

df['cardholder'].replace('hai',0)
df['balance'].fillna('N/A')

0     3000
1        0
2        0
3      N/A
4        0
5     5546
6      N/A
7     6016
8     2428
9        0
10    4601
Name: balance, dtype: object

In [56]:
# Boolean frame with the same rows and columns as df: True wherever the cell is null
df.isna()

Unnamed: 0,custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRisk
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False
6,False,False,False,True,True,False,False,False,False
7,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False
