# Dataframes contd. 
Creating and updating dataframes.   
Inserting, appending and deleting rows and columns  
Updates will be done in the next session

In [1]:
# pandas is imported here under the alias `pd`, which the later cells use
import pandas as pd
# Creating an empty df (no rows, no columns): DataFrame() is called with no data and no column names
country_df = pd.DataFrame()

In [2]:
type(country_df)

pandas.core.frame.DataFrame

In [3]:
country_df

In [4]:
# Build a df that has column headers but no rows:
# the column names are given, and the row data is an empty list
columns = ['column1', 'column2']
data = []
country_df = pd.DataFrame(data, columns=columns)
country_df

Unnamed: 0,column1,column2


### Creating a df with rows and columns by parsing lists of data

In [5]:
# Each inner list is one row; cols names the two positions of every row
cols = ['column1', 'column2']
data_ = [
    ['Norway', 'Oslo'],
    ['Germany', 'Berlin'],
    ['Spain', 'Unknown'],
]
country_df = pd.DataFrame(data_, columns=cols)
country_df

Unnamed: 0,column1,column2
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown


### Rename the columns

In [6]:
# rename returns a new df with the renamed columns; here the result is only displayed.
# It is not assigned back, so country_df itself still has column1/column2 (see the next cell)
country_df.rename(columns={'column1': 'Country','column2':'Capital'})


Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown


In [7]:
country_df

Unnamed: 0,column1,column2
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown


### Note: For df functions such as rename, drop, drop_duplicates, reset_index etc.,
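#### - to actually commit the changes, do either of the below two (never combine them: with inplace=True the function returns None, so assigning that result would overwrite the df with None)
    1) assign the result of the df function back to a dataframe variable or
    2) use the inplace=True keyword arg 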

In [8]:
# Commit the rename without inplace=True: rebind country_df to the df that rename returns
country_df = country_df.rename(
    {'column1': 'Country', 'column2': 'Capital'},
    axis='columns',
)
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown


In [9]:
# Column-oriented data: each key is a column name, each value is the list of that column's entries
country_dict = dict(
    Country=['Norway', 'Germany', 'Spain'],
    Capital=['Oslo', 'Berlin', 'Unknown'],
)

In [10]:
# Build the df from the dict: keys become the column names, the lists become the column values
country_df = pd.DataFrame(data=country_dict)
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown


### len(df) is the no. of rows in a df
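### len of a df is = max(index/row_index) + 1 only while the df has the default 0, 1, 2, ... index (this no longer holds once rows are dropped without resetting the index)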

In [11]:
len(country_df)  

3

### Adding/Appending rows to a df
#### append function - pandas has deprecated DataFrame.append (and removed it in pandas 2.0), so we will not use it 
We will be seeing other ways of appending/adding rows to an existing df using the merge and concat functions in the upcoming training notebooks

In [12]:
# .loc[label, :] selects the row whose index label is 2, with all of its columns
country_df.loc[2,:]

Country      Spain
Capital    Unknown
Name: 2, dtype: object

In [13]:
# Label 3 is not in the index yet (the df has rows 0-2), so assigning through .loc adds a new row with that label
country_df.loc[3] = ['USA','Washington D.C']

In [14]:
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown
3,USA,Washington D.C


In [15]:
# len(country_df) is 4 at this point, so the same USA row is added again under the new label 4.
# Using len(df) as the next label relies on the index being 0..n-1; if that label already
# existed, the assignment would overwrite the existing row instead of adding one
country_df.loc[len(country_df)] = ['USA','Washington D.C']
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown
3,USA,Washington D.C
4,USA,Washington D.C


### Using for loop and len(df) to append list of rows to an existing df

In [16]:
row_lst = [
    ['Russia', 'Moscow'],
    ['Switzerland', 'Bern'],
    ['Austria', 'Unknown'],
]
# Add the rows one at a time; len(country_df) is re-evaluated on every pass,
# so each row is stored under the next label after the current last row
for new_row in row_lst:
    country_df.loc[len(country_df)] = new_row
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown
3,USA,Washington D.C
4,USA,Washington D.C
5,Russia,Moscow
6,Switzerland,Bern
7,Austria,Unknown


### Adding a column

In [17]:
# Assigning to a column name that does not exist yet creates the column;
# the single value None is filled into every row
country_df['Continent'] = None
country_df

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
3,USA,Washington D.C,
4,USA,Washington D.C,
5,Russia,Moscow,
6,Switzerland,Bern,
7,Austria,Unknown,


In [18]:
# Accessing/filtering df rows where Country == 'Norway':
# the comparison gives True/False per row and .loc keeps the rows where it is True
country_df.loc[country_df['Country']=='Norway']

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,


In [19]:
# The above code is equivalent to this one: the explicit ':' after the comma means "all columns"
country_df.loc[country_df['Country']=='Norway',:]

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,


In [20]:
# A filter keeps every matching row: USA was added twice above, so two rows (labels 3 and 4) come back
country_df.loc[country_df['Country']=='USA']

Unnamed: 0,Country,Capital,Continent
3,USA,Washington D.C,
4,USA,Washington D.C,


### Accessing/Filtering df rows where Country belongs to one of the list entries

In [21]:
lst = ['Norway', 'Germany', 'Spain']
# isin gives True for the rows whose Country appears in lst; .loc keeps exactly those rows
in_lst = country_df['Country'].isin(lst)
country_df.loc[in_lst]

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,


In [22]:
# != keeps the rows whose Country is anything other than 'USA'
country_df.loc[country_df['Country']!='USA']

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
5,Russia,Moscow,
6,Switzerland,Bern,
7,Austria,Unknown,


### Using negation (~) of a list filter to access rows 

In [23]:
lst = ['Norway', 'Germany', 'Spain']
# ~ flips the True/False result of isin, so this returns all the rows
# where the Country value is not equal to any of the list entries
not_in_lst = ~country_df['Country'].isin(lst)
country_df.loc[not_in_lst]

Unnamed: 0,Country,Capital,Continent
3,USA,Washington D.C,
4,USA,Washington D.C,
5,Russia,Moscow,
6,Switzerland,Bern,
7,Austria,Unknown,


In [24]:
# Only one row matches; note its index label (5) in the leftmost position of the output
country_df.loc[country_df['Country']=='Russia']

Unnamed: 0,Country,Capital,Continent
5,Russia,Moscow,


### Fetching the index/indices of a df filter

In [25]:
# .index on the filtered df gives the index labels of the matching rows (here just label 5)
country_df.loc[country_df['Country']=='Russia'].index

Int64Index([5], dtype='int64')

In [26]:
# Collect the index labels of all Russia/USA rows into a plain Python list
russia_or_usa = country_df['Country'].isin(['Russia', 'USA'])
idx_lst = list(country_df.loc[russia_or_usa].index)
idx_lst

[3, 4, 5]

### Deleting or dropping df rows using index list

In [27]:
country_df.drop(idx_lst)  # this has no effect on the actual dataframe: the result is only displayed, the change has not been committed

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
6,Switzerland,Bern,
7,Austria,Unknown,


In [28]:
country_df

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
3,USA,Washington D.C,
4,USA,Washington D.C,
5,Russia,Moscow,
6,Switzerland,Bern,
7,Austria,Unknown,


In [29]:
# inplace=True makes drop change country_df itself: the rows labelled in idx_lst are removed
country_df.drop(index=idx_lst, inplace=True)
country_df

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
6,Switzerland,Bern,
7,Austria,Unknown,


### Resetting index - in the above output you see the index has gaps (0, 1, 2, 6, 7) due to row deletions

In [30]:
# reset_index gives the rows a fresh 0..n-1 index; without drop=True the old index
# is kept as a new column named 'index' (visible in the output below)
country_df.reset_index(inplace=True)
country_df

Unnamed: 0,index,Country,Capital,Continent
0,0,Norway,Oslo,
1,1,Germany,Berlin,
2,2,Spain,Unknown,
3,6,Switzerland,Bern,
4,7,Austria,Unknown,


### Resetting index drop=True parameter
In the above output you see the old (non-contiguous) index has now become a column, which is not required  
We can use the drop=True parameter to drop the previously used index

In [31]:
# Remove the unwanted 'index' column that the previous reset_index call created
country_df.drop('index', axis='columns', inplace=True)
# drop=True discards the old index instead of turning it into a column;
# had we used it in the previous snippet, the 'index' column would not have been created at all
country_df.reset_index(inplace=True, drop=True)
country_df

Unnamed: 0,Country,Capital,Continent
0,Norway,Oslo,
1,Germany,Berlin,
2,Spain,Unknown,
3,Switzerland,Bern,
4,Austria,Unknown,


### Deleting or dropping df columns

In [32]:
# columns=[...] makes drop remove columns instead of rows; inplace=True applies the change to country_df itself
country_df.drop(columns=['Continent'],inplace=True)

In [33]:
country_df

Unnamed: 0,Country,Capital
0,Norway,Oslo
1,Germany,Berlin
2,Spain,Unknown
3,Switzerland,Bern
4,Austria,Unknown


### Reading a csv from github and dumping the csv data into a df

In [34]:
# The url of the file's page on github is 'https://github.com/cs109/2014_data/blob/master/countries.csv'.
# To get the url used below, remove the 'blob/' part and change 'github.com' to 'raw.githubusercontent.com'.
url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
# read_csv is given the url string directly and returns the csv contents as a df
sample_df = pd.read_csv(url)
sample_df

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA
...,...,...
189,Paraguay,SOUTH AMERICA
190,Peru,SOUTH AMERICA
191,Suriname,SOUTH AMERICA
192,Uruguay,SOUTH AMERICA


# Kaggle.com - Kaggle is a data science competition platform and online community of data scientists and machine learning practitioners owned by Google LLC. 
# It is an ocean of datasets. These datasets can be leveraged for practicing Data analysis, manipulation, Data science or Machine learning