### Random Sampling

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv')

In [3]:
df.head()

Unnamed: 0,Area_Name,Year,Group_Name,Sub_Group_Name,Cases_Property_Recovered,Cases_Property_Stolen,Value_of_Property_Recovered,Value_of_Property_Stolen
0,Andaman & Nicobar Islands,2001,Burglary - Property,3. Burglary,27,64,755858,1321961
1,Andhra Pradesh,2001,Burglary - Property,3. Burglary,3321,7134,51483437,147019348
2,Arunachal Pradesh,2001,Burglary - Property,3. Burglary,66,248,825115,4931904
3,Assam,2001,Burglary - Property,3. Burglary,539,2423,3722850,21466955
4,Bihar,2001,Burglary - Property,3. Burglary,367,3231,2327135,17023937


#### Random sample 70% without replacement

In [8]:
nrows = df.shape[0]
nrows

2449

In [10]:
df_sample = df.sample(frac=0.70, replace=False, random_state=100)
df_sample.shape

(1714, 8)

#### Bootstrap sample

In [11]:
# randomly pick same no. of rows as in dataset but with replacement
bootstrap_sample = df.sample(frac=1, replace=True, random_state=100)
bootstrap_sample.shape

(2449, 8)

#### Challenge
- Calculate 95% Confidence Interval of the means for the following -


In [12]:
np.random.seed(100)
arr = pd.Series(np.random.normal(10, 3, (100)))
arr.head()

0     4.750704
1    11.028041
2    13.459107
3     9.242692
4    12.943962
dtype: float64

In [13]:
type(arr)

pandas.core.series.Series

In [19]:
# bootstrap sample a large no. of times say 10000
means = []
for i in range(10000):
    means.append(np.mean(arr.sample(frac=1, replace=True)))

# sort
means = pd.Series(sorted(means))

print(np.percentile(means, 2.5), np.percentile(means, 97.5))

9.117631418512076 10.263385872742084


### Dummy Variables

In [21]:
df.Group_Name.unique()

array(['Burglary - Property', 'Criminal Breach of Trust - Property',
       'Dacoity -Property', 'Other heads of Property',
       'Robbery - Property', 'Theft - Property', 'Total Property'],
      dtype=object)

In [24]:
pd.get_dummies(df.Group_Name, prefix='Group').head()

Unnamed: 0,Group_Burglary - Property,Group_Criminal Breach of Trust - Property,Group_Dacoity -Property,Group_Other heads of Property,Group_Robbery - Property,Group_Theft - Property,Group_Total Property
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


### Categorical Data

In [34]:
group_name_cat = df.Group_Name.astype('category')

In [39]:
type(group_name_cat)

pandas.core.series.Series

In [36]:
group_name_cat.head()

0    Burglary - Property
1    Burglary - Property
2    Burglary - Property
3    Burglary - Property
4    Burglary - Property
Name: Group_Name, dtype: category
Categories (7, object): ['Burglary - Property', 'Criminal Breach of Trust - Property', 'Dacoity -Property', 'Other heads of Property', 'Robbery - Property', 'Theft - Property', 'Total Property']

In [40]:
group_name_cat1 = pd.Categorical(df.Group_Name, df.Group_Name.unique())
group_name_cat1

['Burglary - Property', 'Burglary - Property', 'Burglary - Property', 'Burglary - Property', 'Burglary - Property', ..., 'Total Property', 'Total Property', 'Total Property', 'Total Property', 'Total Property']
Length: 2449
Categories (7, object): ['Burglary - Property', 'Criminal Breach of Trust - Property', 'Dacoity -Property', 'Other heads of Property', 'Robbery - Property', 'Theft - Property', 'Total Property']

In [41]:
type(group_name_cat1)

pandas.core.arrays.categorical.Categorical

In [42]:
group_name_cat1.codes

array([0, 0, 0, ..., 6, 6, 6], dtype=int8)

In [43]:
group_name_cat1.categories

Index(['Burglary - Property', 'Criminal Breach of Trust - Property',
       'Dacoity -Property', 'Other heads of Property', 'Robbery - Property',
       'Theft - Property', 'Total Property'],
      dtype='object')

In [44]:
group_name_cat.cat.categories

Index(['Burglary - Property', 'Criminal Breach of Trust - Property',
       'Dacoity -Property', 'Other heads of Property', 'Robbery - Property',
       'Theft - Property', 'Total Property'],
      dtype='object')

In [46]:
df.head(3)

Unnamed: 0,Area_Name,Year,Group_Name,Sub_Group_Name,Cases_Property_Recovered,Cases_Property_Stolen,Value_of_Property_Recovered,Value_of_Property_Stolen
0,Andaman & Nicobar Islands,2001,Burglary - Property,3. Burglary,27,64,755858,1321961
1,Andhra Pradesh,2001,Burglary - Property,3. Burglary,3321,7134,51483437,147019348
2,Arunachal Pradesh,2001,Burglary - Property,3. Burglary,66,248,825115,4931904


In [47]:
df.Sub_Group_Name.unique()

array(['3. Burglary', '5. Criminal Breach of Trust', '1. Dacoity',
       '6. Other Property', '2. Robbery', '4. Theft',
       '7. Total Property Stolen & Recovered'], dtype=object)

In [51]:
subgroup_cat = df.Sub_Group_Name.astype('category')

In [52]:
subgroup_cat.cat.categories

Index(['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft',
       '5. Criminal Breach of Trust', '6. Other Property',
       '7. Total Property Stolen & Recovered'],
      dtype='object')

In [54]:
subgroup_cat.cat.add_categories('missing')

0                                3. Burglary
1                                3. Burglary
2                                3. Burglary
3                                3. Burglary
4                                3. Burglary
                        ...                 
2444    7. Total Property Stolen & Recovered
2445    7. Total Property Stolen & Recovered
2446    7. Total Property Stolen & Recovered
2447    7. Total Property Stolen & Recovered
2448    7. Total Property Stolen & Recovered
Name: Sub_Group_Name, Length: 2449, dtype: category
Categories (8, object): ['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft', '5. Criminal Breach of Trust', '6. Other Property', '7. Total Property Stolen & Recovered', 'missing']

#### add categories

In [55]:
subgroup_cat = subgroup_cat.cat.add_categories('missing')

In [56]:
subgroup_cat.value_counts()

1. Dacoity                              350
2. Robbery                              350
3. Burglary                             350
4. Theft                                350
5. Criminal Breach of Trust             350
6. Other Property                       350
7. Total Property Stolen & Recovered    349
missing                                   0
Name: Sub_Group_Name, dtype: int64

#### remove unused categories

In [59]:
subgroup_cat = subgroup_cat.cat.remove_unused_categories()

In [60]:
subgroup_cat.value_counts()

1. Dacoity                              350
2. Robbery                              350
3. Burglary                             350
4. Theft                                350
5. Criminal Breach of Trust             350
6. Other Property                       350
7. Total Property Stolen & Recovered    349
Name: Sub_Group_Name, dtype: int64

#### remove used categories

In [62]:
subgroup_cat.cat.remove_categories('3. Burglary')

0                                        NaN
1                                        NaN
2                                        NaN
3                                        NaN
4                                        NaN
                        ...                 
2444    7. Total Property Stolen & Recovered
2445    7. Total Property Stolen & Recovered
2446    7. Total Property Stolen & Recovered
2447    7. Total Property Stolen & Recovered
2448    7. Total Property Stolen & Recovered
Name: Sub_Group_Name, Length: 2449, dtype: category
Categories (6, object): ['1. Dacoity', '2. Robbery', '4. Theft', '5. Criminal Breach of Trust', '6. Other Property', '7. Total Property Stolen & Recovered']

In [66]:
subgroup_ordcat = pd.Series(pd.Categorical(df.Sub_Group_Name, ordered=True))

In [67]:
subgroup_ordcat.cat.categories

Index(['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft',
       '5. Criminal Breach of Trust', '6. Other Property',
       '7. Total Property Stolen & Recovered'],
      dtype='object')

In [71]:
subgroup_ordcat.head(3)

0    3. Burglary
1    3. Burglary
2    3. Burglary
dtype: category
Categories (7, object): ['1. Dacoity' < '2. Robbery' < '3. Burglary' < '4. Theft' < '5. Criminal Breach of Trust' < '6. Other Property' < '7. Total Property Stolen & Recovered']

In [72]:
subgroup_ordcat.sort_values()

874                               1. Dacoity
938                               1. Dacoity
937                               1. Dacoity
936                               1. Dacoity
935                               1. Dacoity
                        ...                 
2212    7. Total Property Stolen & Recovered
2211    7. Total Property Stolen & Recovered
2210    7. Total Property Stolen & Recovered
2218    7. Total Property Stolen & Recovered
2448    7. Total Property Stolen & Recovered
Length: 2449, dtype: category
Categories (7, object): ['1. Dacoity' < '2. Robbery' < '3. Burglary' < '4. Theft' < '5. Criminal Breach of Trust' < '6. Other Property' < '7. Total Property Stolen & Recovered']

In [75]:
grades = pd.Series(['A','A+','B', 'C', 'Excellence','A','A+','B+'])
grades

0             A
1            A+
2             B
3             C
4    Excellence
5             A
6            A+
7            B+
dtype: object

In [82]:
grades_cat = pd.Series(pd.Categorical(grades))
grades_cat

0             A
1            A+
2             B
3             C
4    Excellence
5             A
6            A+
7            B+
dtype: category
Categories (6, object): ['A', 'A+', 'B', 'B+', 'C', 'Excellence']

In [83]:
grades_cat1 = pd.Series(pd.Categorical(grades, ordered=True))
grades_cat1

0             A
1            A+
2             B
3             C
4    Excellence
5             A
6            A+
7            B+
dtype: category
Categories (6, object): ['A' < 'A+' < 'B' < 'B+' < 'C' < 'Excellence']

In [84]:
grades_cat1.cat.categories

Index(['A', 'A+', 'B', 'B+', 'C', 'Excellence'], dtype='object')

In [86]:
grades_cat1.sort_values()

0             A
5             A
1            A+
6            A+
2             B
7            B+
3             C
4    Excellence
dtype: category
Categories (6, object): ['A' < 'A+' < 'B' < 'B+' < 'C' < 'Excellence']

In [87]:
grades_cat1 = grades_cat1.cat.reorder_categories(['C','B','B+','A','A+','Excellence'])
grades_cat1

0             A
1            A+
2             B
3             C
4    Excellence
5             A
6            A+
7            B+
dtype: category
Categories (6, object): ['C' < 'B' < 'B+' < 'A' < 'A+' < 'Excellence']

In [88]:
grades_cat1.sort_values()

3             C
2             B
7            B+
0             A
5             A
1            A+
6            A+
4    Excellence
dtype: category
Categories (6, object): ['C' < 'B' < 'B+' < 'A' < 'A+' < 'Excellence']

In [90]:
grades_cat1.cat.codes

0    3
1    4
2    1
3    0
4    5
5    3
6    4
7    2
dtype: int8