In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df.head()

Unnamed: 0,Ethnicity,Time,Time_Type,Region,Age,Value,Numerator
0,All,2004,year,All,16-24,59.7,3934400
1,All,2004,year,East Midlands,16-24,61.4,300300
2,All,2004,year,East of England,16-24,65.8,371600
3,All,2004,year,London,16-24,47.2,424100
4,All,2004,year,North East,16-24,58.6,176600


### Basics

In [3]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
# get data types
print('\n')
print(df.dtypes)

# number of rows in dataframe
df.shape[0]
r, c = df.shape
print('\n')
print(r)

# columns
print('\n')
print(df.columns)

# info
print('\n')
print(df.info())

# basic stats
print('\n')
print(df.describe())



Ethnicity    object
Time          int64
Time_Type    object
Region       object
Age          object
Value        object
Numerator    object
dtype: object


28080


Index(['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value',
       'Numerator'],
      dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28080 entries, 0 to 28079
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Ethnicity  28080 non-null  object
 1   Time       28080 non-null  int64 
 2   Time_Type  28080 non-null  object
 3   Region     28080 non-null  object
 4   Age        28080 non-null  object
 5   Value      28080 non-null  object
 6   Numerator  28080 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.5+ MB
None


               Time
count  28080.000000
mean    2011.000000
std        4.320571
min     2004.000000
25%     2007.000000
50%     2011.000000
75%     2015.000000
max     2018.000000


### Drop row/column

In [4]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
print(df.head())

df.drop(df[df['Ethnicity'] == 'All'].index, inplace=True)
print('\n')
print(df.head())

df.drop('Numerator', axis=1, inplace=True)
print('\n')
print(df.head())

  Ethnicity  Time Time_Type           Region    Age Value Numerator
0       All  2004      year              All  16-24  59.7   3934400
1       All  2004      year    East Midlands  16-24  61.4    300300
2       All  2004      year  East of England  16-24  65.8    371600
3       All  2004      year           London  16-24  47.2    424100
4       All  2004      year       North East  16-24  58.6    176600


    Ethnicity  Time Time_Type           Region    Age Value Numerator
144     Asian  2004      year              All  16-24  37.1    178600
145     Asian  2004      year    East Midlands  16-24    37     12600
146     Asian  2004      year  East of England  16-24  45.4     10300
147     Asian  2004      year           London  16-24  37.9     65500
148     Asian  2004      year       North East  16-24     ?         ?


    Ethnicity  Time Time_Type           Region    Age Value
144     Asian  2004      year              All  16-24  37.1
145     Asian  2004      year    East Midlands  

### Change data types

In [5]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df = df.astype({'Age': 'category'})
print(df.dtypes)

age_dtype = pd.api.types.CategoricalDtype(categories=['16-24', '25-49', '50-64', 'All'], ordered=True)
df = df.astype({'Age': age_dtype})
print(df.dtypes)

Ethnicity      object
Time            int64
Time_Type      object
Region         object
Age          category
Value          object
Numerator      object
dtype: object
Ethnicity      object
Time            int64
Time_Type      object
Region         object
Age          category
Value          object
Numerator      object
dtype: object


### Rename columns

In [6]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df.rename(columns={'Time_Type': 'Time_Units'}, inplace=True)
df.head()

Unnamed: 0,Ethnicity,Time,Time_Units,Region,Age,Value,Numerator
0,All,2004,year,All,16-24,59.7,3934400
1,All,2004,year,East Midlands,16-24,61.4,300300
2,All,2004,year,East of England,16-24,65.8,371600
3,All,2004,year,London,16-24,47.2,424100
4,All,2004,year,North East,16-24,58.6,176600


### Copy dataframe

In [7]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )

print('\nShallow copy-rename')
dfb = df.copy(deep=False)
dfb.iloc[0, 0] = 'New'
print('dfb')
print(dfb.head())
print('df')
print(df.head())

df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
print('\nDeep copy-rename')
dfb = df.copy(deep=True)
dfb.iloc[0, 0] = 'New'
print('dfb')
print(dfb.head())
print('df')
print(df.head())


Shallow copy-rename
dfb
  Ethnicity  Time Time_Type           Region    Age Value Numerator
0       New  2004      year              All  16-24  59.7   3934400
1       All  2004      year    East Midlands  16-24  61.4    300300
2       All  2004      year  East of England  16-24  65.8    371600
3       All  2004      year           London  16-24  47.2    424100
4       All  2004      year       North East  16-24  58.6    176600
df
  Ethnicity  Time Time_Type           Region    Age Value Numerator
0       New  2004      year              All  16-24  59.7   3934400
1       All  2004      year    East Midlands  16-24  61.4    300300
2       All  2004      year  East of England  16-24  65.8    371600
3       All  2004      year           London  16-24  47.2    424100
4       All  2004      year       North East  16-24  58.6    176600

Deep copy-rename
dfb
  Ethnicity  Time Time_Type           Region    Age Value Numerator
0       New  2004      year              All  16-24  59.7   393440

### Working with indexes

In [8]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
# value of index
print(df.index.values[0])

# set index
df = df.set_index('Ethnicity')
print(df.head())

df = df.reset_index()
print(df.head())

0
           Time Time_Type           Region    Age Value Numerator
Ethnicity                                                        
All        2004      year              All  16-24  59.7   3934400
All        2004      year    East Midlands  16-24  61.4    300300
All        2004      year  East of England  16-24  65.8    371600
All        2004      year           London  16-24  47.2    424100
All        2004      year       North East  16-24  58.6    176600
  Ethnicity  Time Time_Type           Region    Age Value Numerator
0       All  2004      year              All  16-24  59.7   3934400
1       All  2004      year    East Midlands  16-24  61.4    300300
2       All  2004      year  East of England  16-24  65.8    371600
3       All  2004      year           London  16-24  47.2    424100
4       All  2004      year       North East  16-24  58.6    176600


### Sort values

In [9]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df.sort_values(by=['Region'], ascending=False)
df.head(20)

Unnamed: 0,Ethnicity,Time,Time_Type,Region,Age,Value,Numerator
0,All,2004,year,All,16-24,59.7,3934400
1,All,2004,year,East Midlands,16-24,61.4,300300
2,All,2004,year,East of England,16-24,65.8,371600
3,All,2004,year,London,16-24,47.2,424100
4,All,2004,year,North East,16-24,58.6,176600
5,All,2004,year,North West,16-24,57.2,457900
6,All,2004,year,Scotland,16-24,63.4,369600
7,All,2004,year,South East,16-24,65.3,564300
8,All,2004,year,South West,16-24,66.1,345500
9,All,2004,year,Wales,16-24,58.2,197700


### Correlation

In [10]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
column_1 = df.loc[(df['Numerator'] != '?') 
                  & (df['Numerator'] != '-') 
                  & (df['Value'] != '?'), 'Numerator'].astype('float32')
column_2 = df.loc[(df['Numerator'] != '?') 
                  & (df['Numerator'] != '-') 
                  & (df['Value'] != '?'), 'Value'].astype('float32')

column_1.corr(column_2)

0.18659113253375414

### Cut

In [11]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
column_1 = df.loc[(df['Numerator'] != '?') 
                  & (df['Numerator'] != '-') 
                  & (df['Value'] != '?'), 'Numerator'].astype('float32')
df = df.loc[(df['Numerator'] != '?') 
                  & (df['Numerator'] != '-') 
                  & (df['Value'] != '?'), ['Numerator', 'Value']].astype('float32')

df['bins'] = pd.cut(column_1, bins=5)
bins = sorted(list(df['bins'].unique()))
for bin in bins:
    print(df.loc[df['bins'] == bin, ['Numerator', 'bins']].head())
    print('\n')
# df.loc[:,['Numerator', 'bins']].sort_values(by='bins')

   Numerator                   bins
0  3934400.0  (-29315.8, 6023960.0]
1   300300.0  (-29315.8, 6023960.0]
2   371600.0  (-29315.8, 6023960.0]
3   424100.0  (-29315.8, 6023960.0]
4   176600.0  (-29315.8, 6023960.0]


      Numerator                     bins
48    8915700.0  (6023960.0, 12047120.0]
60    7623300.0  (6023960.0, 12047120.0]
72    6584500.0  (6023960.0, 12047120.0]
1488  8082600.0  (6023960.0, 12047120.0]
1500  7014200.0  (6023960.0, 12047120.0]


       Numerator                      bins
36    16539000.0  (12047120.0, 18070280.0]
120   14595800.0  (12047120.0, 18070280.0]
132   12462100.0  (12047120.0, 18070280.0]
1476  15096800.0  (12047120.0, 18070280.0]
1560  13431400.0  (12047120.0, 18070280.0]


       Numerator                      bins
1692  23746800.0  (18070280.0, 24093440.0]
3564  23787200.0  (18070280.0, 24093440.0]
5436  23608000.0  (18070280.0, 24093440.0]
7308  23632100.0  (18070280.0, 24093440.0]
9180  23532100.0  (18070280.0, 24093440.0]


       Numerat

### Convert categorical values into individual columns

In [12]:
cols = ['Time', 'Region']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
print(df.head())
df = pd.get_dummies(df, prefix='Region', columns=['Region'])
print(df.head())

   Time           Region
0  2004              All
1  2004    East Midlands
2  2004  East of England
3  2004           London
4  2004       North East
   Time  Region_All  Region_East Midlands  Region_East of England  \
0  2004           1                     0                       0   
1  2004           0                     1                       0   
2  2004           0                     0                       1   
3  2004           0                     0                       0   
4  2004           0                     0                       0   

   Region_London  Region_North East  Region_North West  Region_Scotland  \
0              0                  0                  0                0   
1              0                  0                  0                0   
2              0                  0                  0                0   
3              1                  0                  0                0   
4              0                  1                  0      

### Pivot table

In [13]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df.pivot_table(values='Value', index='Ethnicity', columns='Region', aggfunc=sum).head()

Region,All,East Midlands,East of England,London,North East,North West,Scotland,South East,South West,Wales,West Midlands,Yorkshire and The Humber
Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,59.76257.581.188.473.964.271.856.772.679.166.3...,61.463.85982.589.675.664.372.556.173.58067.161...,65.868.663.184.192.476.169.377.661.276.984.269...,47.248.745.775.884.966.563.271.555.568.175.860...,58.661.256.178.885.272.654.560.748.568.173.662...,57.259.554.980.587.174.160.566.754.370.676.265...,63.465.161.781.286.875.961.968.955.372.677.867...,65.366.763.884.492.176.970.379.161.977.284.170...,66.168.363.984.190.677.76774.659.775.881.969.9...,58.26155.479.886.473.657.864.351.569.37563.757...,58.361.455.280.387.972.863.671.655.671.678.564...,60.463.357.58187.574.763.171.155.372.278.366.1...
Asian,37.14033.86782.251.450.364.836.15768.744.836.3...,3737.4?70.684.757.150.9??59.568.949.541.7?35.6...,45.4??71.886.956.158.2??63.777.349.740.346.8?7...,37.939.735.667.181.852.152.365.340.35868.946.5...,???61.676.443.5???56.772.940.6???69.382.851.1?...,33.43333.960.282.237.442.3??50.365.434.935.141...,???62.871.951.9???55.565.643.1???66.380.451.9?...,38.939.338.478.492.665.460.272.249.466.375.957...,???78.3?67.9???7180.261.2???83.7?????72.479.56...,???71.5?????56.266.547.2???64.1?????52.5?42.3?...,32.836.229.662.876.148.344.358.52952.163.540.4...,32.739.724.459.780.638.541.5??4964.831.737.146...
Asian Other,33.628.839.966.977.257.264.475.255.558.463.453...,???70.7?????60.2???????????61.8???????????51.2...,???74.3?????6675.657.6???75.3?????67.875.460.6...,36.8??65.57951.367.6??59.466.85226.9??65.178.2...,?????????62.5???????????60.7??????????????????...,???60.1?????53?47.4???65.3?????62.271.352???68...,?????????45.7?????67.5?????60.1???????????52??...,???78.787.672.3???6666.965.326.6??77.987.470.7...,?????????72???????????69.4???????????71.4?????...,?????????????????????????????????55.7?????????...,???56?????45.9?????70.5?????56.5?????72.1?????...,?????????45.4?????????????????????????????????...
Black,35.637.334.167.673.562.661.968.6576065.155.735...,???63.4?????58.962.955.4???71?????63.366.659.8...,???83.584.382.8???78.980.877.1???81.884.979.1?...,31.934.929.665.372.759.859.367.453.757.363.652...,??????????????????????????????????????????????...,???62.8?????53.857.449.3???70.4?????62.363.661...,??????????????????????????????????????????????...,???76.780.572.6???71.376.665.8???82.384.380.3?...,?????????67.4???????????72.9???????????67.3???...,??????????????????????????????????????????????...,???70.373.667.5???61.962.561.5???68.774.463.6?...,?????????59.2?????72.8?????64.2?????71.8?????6...
Indian,44.246.241.879.689.369.956.770.142.167.975.959...,45.9??7789.365.850??65.273.956.541.9??76.884.5...,???86.8?????758564.2???85.391.4????78.285.170....,43.247.737.37988.469.759.270.846.667.775.659.2...,??????????????????????????????????????????????...,???72.891.955.4???62.973.852???71.989.352.8???...,?????????????????????????????????????????????7...,47.1??86.896.27764.7??75.18366.943.3??84.192.7...,?????????76.7???????????80.4??????????????????...,??????????????????????????????????????????????...,40.7??78.384.272.351.7??65.272.457.843.5??75.6...,???78.3?????65.773.4????82.2?????73.7?????80.4...


### Mapping

In [14]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
EthnicityDict  = {'Asian':'Asian', 
                  'Asian Other':'Asian', 
                  'Indian':'Asian', 
                  'Pakistani and Bangladeshi':'Asian', 

                  'Black':'Black', 

                  'Mixed':'Misc', 
                  'Other':'Misc', 
                  'Other than White':'Misc',
                  'Unknown':'Misc', 

                  'White':'White', 
                  'White British':'White', 
                  'White Other':'White',
                  }


df = df.loc[(df['Value'] != '?') & (df['Value'] != '-')]
df = df.loc[(df['Ethnicity'] != 'All')]
df['EthnicityCategory'] = list(pd.Series(df['Ethnicity']).map(EthnicityDict))
df.head()

Unnamed: 0,Ethnicity,Time,Time_Type,Region,Age,Value,Numerator,EthnicityCategory
144,Asian,2004,year,All,16-24,37.1,178600,Asian
145,Asian,2004,year,East Midlands,16-24,37.0,12600,Asian
146,Asian,2004,year,East of England,16-24,45.4,10300,Asian
147,Asian,2004,year,London,16-24,37.9,65500,Asian
149,Asian,2004,year,North West,16-24,33.4,17100,Asian


### Simple groupby

In [15]:
cities = [{'country': 'usa', 'state': 'ma', 'city': 'boston', 'population': 10},
          {'country': 'usa', 'state': 'ma', 'city': 'arlington', 'population': 8.5},
          {'country': 'usa', 'state': 'ma', 'city': 'medford', 'population': 7},
          {'country': 'usa', 'state': 'ny', 'city': 'nyc', 'population': 15},
          {'country': 'usa', 'state': 'ny', 'city': 'albany', 'population': 6},
         ]
df = pd.DataFrame(cities)
print(df)

for state, df in df.groupby('state'):
    print(state)
    print(df)
    print('\n')

  country state       city  population
0     usa    ma     boston        10.0
1     usa    ma  arlington         8.5
2     usa    ma    medford         7.0
3     usa    ny        nyc        15.0
4     usa    ny     albany         6.0
ma
  country state       city  population
0     usa    ma     boston        10.0
1     usa    ma  arlington         8.5
2     usa    ma    medford         7.0


ny
  country state    city  population
3     usa    ny     nyc        15.0
4     usa    ny  albany         6.0




### Groupby using function

In [16]:
cities = [{'country': 'usa', 'state': 'ma', 'city': 'boston', 'population': 10},
          {'country': 'usa', 'state': 'ma', 'city': 'arlington', 'population': 8.5},
          {'country': 'usa', 'state': 'ma', 'city': 'medford', 'population': 7},
          {'country': 'usa', 'state': 'or', 'city': 'portland', 'population': 7},
          {'country': 'usa', 'state': 'ny', 'city': 'nyc', 'population': 15},
          {'country': 'usa', 'state': 'ny', 'city': 'albany', 'population': 6},
         ]
df = pd.DataFrame(cities)
print(df)

df = df.set_index('state')

def fun(item):
    if item[0] <= 'm':
        return 0
    else:
        return 1
    
for state, df in df.groupby(fun):
    print(state)
    print(df)
    print('\n')

  country state       city  population
0     usa    ma     boston        10.0
1     usa    ma  arlington         8.5
2     usa    ma    medford         7.0
3     usa    or   portland         7.0
4     usa    ny        nyc        15.0
5     usa    ny     albany         6.0
0
      country       city  population
state                               
ma        usa     boston        10.0
ma        usa  arlington         8.5
ma        usa    medford         7.0


1
      country      city  population
state                              
or        usa  portland         7.0
ny        usa       nyc        15.0
ny        usa    albany         6.0




### Groupby Split-apply-combine

In [17]:
cities = [{'country': 'usa', 'state': 'ma', 'city': 'boston', 'population': 10},
          {'country': 'usa', 'state': 'ma', 'city': 'arlington', 'population': 8.5},
          {'country': 'usa', 'state': 'ma', 'city': 'medford', 'population': 7},
          {'country': 'usa', 'state': 'or', 'city': 'portland', 'population': 7},
          {'country': 'usa', 'state': 'ny', 'city': 'nyc', 'population': 15},
          {'country': 'usa', 'state': 'ny', 'city': 'albany', 'population': 6},
         ]
df = pd.DataFrame(cities)
print(df)
df.groupby('state').agg({'population': np.sum, 'city': np.min})

  country state       city  population
0     usa    ma     boston        10.0
1     usa    ma  arlington         8.5
2     usa    ma    medford         7.0
3     usa    or   portland         7.0
4     usa    ny        nyc        15.0
5     usa    ny     albany         6.0


Unnamed: 0_level_0,population,city
state,Unnamed: 1_level_1,Unnamed: 2_level_1
ma,25.5,arlington
ny,21.0,albany
or,7.0,portland


In [18]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
EthnicityDict  = {'Asian':'Asian', 
                  'Asian Other':'Asian', 
                  'Indian':'Asian', 
                  'Pakistani and Bangladeshi':'Asian', 

                  'Black':'Black', 

                  'Mixed':'Misc', 
                  'Other':'Misc', 
                  'Other than White':'Misc',
                  'Unknown':'Misc', 

                  'White':'White', 
                  'White British':'White', 
                  'White Other':'White',
                  }

df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df = df.loc[(df['Numerator'] != '?') 
                  & (df['Numerator'] != '-') 
                  & (df['Value'] != '?'), :]
df['EthnicityCategory'] = list(pd.Series(df['Ethnicity']).map(EthnicityDict))
df['Value'] = df['Value'].astype('float')
df.groupby('EthnicityCategory').agg(
    {
        'Value':['sum', 'mean', 'std'],
        'Time':['size']
    })

Unnamed: 0_level_0,Value,Value,Value,Time
Unnamed: 0_level_1,sum,mean,std,size
EthnicityCategory,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Asian,217816.8,58.949066,18.159675,3695
Black,48045.5,63.636424,14.12841,755
Misc,171189.1,57.561903,15.711165,2974
White,387520.8,71.300975,10.995605,5435


### Merge - use when joining on columns

In [19]:
df = pd.DataFrame(data=[['Japan', 'Tokyo', 'Japanese']
                        , ['USA', 'Washington DC', 'English']
                        , ['Germany', 'Berlin', 'German']
                       ]
                  , columns=['Country', 'City', 'Language']
                 )
print(df.head())

population = [{'Country': 'USA', 'Population': 10000, 'City': 'NYC'}
              , {'Country': 'England', 'Population': 10, 'City': 'Manchester'}
              , {'Country': 'Mexico', 'Population': 1, 'City': 'Mexico City'}
             ]
df_pop = pd.DataFrame(population)
print('\n{}'.format(df_pop.head()))

df3 = pd.merge(df, df_pop
               , left_on='Country'
               , right_on='Country'
               , suffixes=('_x', '_y')
              )
print('\n{}'.format(df3.head()))

df3 = pd.merge(df, df_pop
               , how='left'
               , left_on='Country'
               , right_on='Country'
               , suffixes=('_x', '_y')
              )
print('\n{}'.format(df3.head()))

   Country           City  Language
0    Japan          Tokyo  Japanese
1      USA  Washington DC   English
2  Germany         Berlin    German

   Country  Population         City
0      USA       10000          NYC
1  England          10   Manchester
2   Mexico           1  Mexico City

  Country         City_x Language  Population City_y
0     USA  Washington DC  English       10000    NYC

   Country         City_x  Language  Population City_y
0    Japan          Tokyo  Japanese         NaN    NaN
1      USA  Washington DC   English     10000.0    NYC
2  Germany         Berlin    German         NaN    NaN


### Join - use when joining on index

In [20]:
df = pd.DataFrame(data=[['Japan', 'Tokyo', 'Japanese']
                        , ['USA', 'Washington DC', 'English']
                        , ['Germany', 'Berlin', 'German']
                       ]
                  , columns=['Country', 'City', 'Language']
                 )
print(df.head())

population = [{'Country': 'USA', 'Population': 10000, 'City': 'NYC'}
              , {'Country': 'England', 'Population': 10, 'City': 'Manchester'}
              , {'Country': 'Mexico', 'Population': 1, 'City': 'Mexico City'}
             ]
df_pop = pd.DataFrame(population)
print('\n{}'.format(df_pop.head()))

df_j = df.join(df_pop
               , how='left'
               , lsuffix='_x'
               , rsuffix='_y'
              )
df_j

   Country           City  Language
0    Japan          Tokyo  Japanese
1      USA  Washington DC   English
2  Germany         Berlin    German

   Country  Population         City
0      USA       10000          NYC
1  England          10   Manchester
2   Mexico           1  Mexico City


Unnamed: 0,Country_x,City_x,Language,Country_y,Population,City_y
0,Japan,Tokyo,Japanese,USA,10000,NYC
1,USA,Washington DC,English,England,10,Manchester
2,Germany,Berlin,German,Mexico,1,Mexico City


### Concatenate - use when appending axis-wise, on index - horizontally

In [21]:
df = pd.DataFrame(data=[['Japan', 'Tokyo']
                        , ['USA', 'Washington DC']
                        , ['Germany', 'Berlin']
                       ]
                  , columns=['Country', 'City']
                  , index=[0,1,2]
                 )
print(df)

df_pop = pd.DataFrame(data=[['USA', 10000]
                        , ['England', 10]
                        , ['Mexico', 1]
                       ]
                      , columns=['Country', 'Population']
                      , index=[1,3,4]
                 )

print('\n{}'.format(df_pop))

# default outer
df_concat = pd.concat([df, df_pop]
                      , axis=1
                     )
print('\n{}'.format(df_concat))

# inner
df_concat = pd.concat([df, df_pop]
                      , axis=1
                      , join='inner'
                     )
print('\n{}'.format(df_concat))

# concatenation is based on index, not columns
df_pop = pd.DataFrame(data=[['USA', 10000]
                        , ['England', 10]
                        , ['Mexico', 1]
                       ]
                      , columns=['Country', 'Population']
                      , index=[1,2,4]
                 )
df_concat = pd.concat([df, df_pop]
                      , axis=1
                      , join='inner'
                     )
print('\n{}'.format(df_concat))

# remove index labels from output
df_concat = pd.concat([df, df_pop]
                      , axis=1
                      , join='inner'
                      , ignore_index=True
                     )
print('\n{}'.format(df_concat))

# specify keys
df_concat = pd.concat([df, df_pop]
                      , axis=1
                      , join='outer'
                      , keys=['a', 'b']
                     )
print('\n{}'.format(df_concat))

   Country           City
0    Japan          Tokyo
1      USA  Washington DC
2  Germany         Berlin

   Country  Population
1      USA       10000
3  England          10
4   Mexico           1

   Country           City  Country  Population
0    Japan          Tokyo      NaN         NaN
1      USA  Washington DC      USA     10000.0
2  Germany         Berlin      NaN         NaN
3      NaN            NaN  England        10.0
4      NaN            NaN   Mexico         1.0

  Country           City Country  Population
1     USA  Washington DC     USA       10000

   Country           City  Country  Population
1      USA  Washington DC      USA       10000
2  Germany         Berlin  England          10

         0              1        2      3
1      USA  Washington DC      USA  10000
2  Germany         Berlin  England     10

         a                       b           
   Country           City  Country Population
0    Japan          Tokyo      NaN        NaN
1      USA  Washingto

### Concatenate - use when appending axis-wise, on index - vertically

In [22]:
df = pd.DataFrame(data=[['Japan', 'Tokyo']
                        , ['USA', 'Washington DC']
                        , ['Germany', 'Berlin']
                       ]
                  , columns=['Country', 'City']
                  , index=[0,1,2]
                 )
print(df)

dfa = pd.DataFrame(data=[['USA', 'NYC',1]
                        , ['England', 'Manchester',2]
                        , ['Mexico', 'Mexico City',3]
                       ]
                      , columns=['Country', 'City','Population']
                      , index=[0,1,2]
                 )

print('\n{}'.format(dfa))

# default outer
df_concat = pd.concat([df, dfa]
                      , axis=0
                      , sort=False
                     )
print('\n{}'.format(df_concat))

# inner
df_concat = pd.concat([df, dfa]
                      , axis=0
                      , join='inner'
                      , sort=False
                     )
print('\n{}'.format(df_concat))

# specify keys
df_concat = pd.concat([df, dfa]
                      , axis=0
                      , join='inner'
                      , sort=False
                      , keys=['a','b']
                     )
print('\n{}'.format(df_concat))

   Country           City
0    Japan          Tokyo
1      USA  Washington DC
2  Germany         Berlin

   Country         City  Population
0      USA          NYC           1
1  England   Manchester           2
2   Mexico  Mexico City           3

   Country           City  Population
0    Japan          Tokyo         NaN
1      USA  Washington DC         NaN
2  Germany         Berlin         NaN
0      USA            NYC         1.0
1  England     Manchester         2.0
2   Mexico    Mexico City         3.0

   Country           City
0    Japan          Tokyo
1      USA  Washington DC
2  Germany         Berlin
0      USA            NYC
1  England     Manchester
2   Mexico    Mexico City

     Country           City
a 0    Japan          Tokyo
  1      USA  Washington DC
  2  Germany         Berlin
b 0      USA            NYC
  1  England     Manchester
  2   Mexico    Mexico City


## Normalize data

In [23]:
df = pd.DataFrame([[100, 10, 30], [120, 30, 40], [130, 10, 30]], columns=['height', 'years', 'age'])

# simple feature scaling
df['height_feature_scaling'] = df['height'] / df['height'].max()

# min-max
df['height_minmax'] = (df['height'] - df['height'].min()) / (df['height'].max() - df['height'].min())

# z score
df['height_z'] = (df['height'] - df['height'].mean()) / df['height'].std()

df

Unnamed: 0,height,years,age,height_feature_scaling,height_minmax,height_z
0,100,10,30,0.769231,0.0,-1.091089
1,120,30,40,0.923077,0.666667,0.218218
2,130,10,30,1.0,1.0,0.872872


### Value_counts

In [24]:
cols = ['Ethnicity', 'Time', 'Time_Type', 'Region', 'Age', 'Value', 'Numerator']
df = pd.read_csv('../data/misc/employment-by-region.csv'
                 , low_memory=False
                 , usecols=cols
                )
df.head()

Unnamed: 0,Ethnicity,Time,Time_Type,Region,Age,Value,Numerator
0,All,2004,year,All,16-24,59.7,3934400
1,All,2004,year,East Midlands,16-24,61.4,300300
2,All,2004,year,East of England,16-24,65.8,371600
3,All,2004,year,London,16-24,47.2,424100
4,All,2004,year,North East,16-24,58.6,176600


In [25]:
df['Age'].value_counts().to_frame()

Unnamed: 0,Age
50-64,7020
16-24,7020
All,7020
25-49,7020


In [26]:
%config Completer.use_jedi = False