In [434]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Generate 150 random dates
start_date = datetime(2023, 1, 1)
date_list = [start_date + timedelta(days=i) for i in range(150)]
random_dates = random.choices(date_list, k=150)

# --- WEATHER DATASET ---
weather_data = {
    'date': [d.strftime('%Y-%m-%d') if i % 10 != 0 else d.strftime('%d/%m/%Y') for i, d in enumerate(random_dates)],
    'Temp': [round(np.random.uniform(15, 40), 1) if i % 8 != 0 else 'hot' for i in range(150)],
    'humidity(%)': [np.random.randint(20, 100) if i % 7 != 0 else np.nan for i in range(150)],
    'wind_speed': [round(np.random.uniform(0, 20), 2) if i % 5 != 0 else "fast" for i in range(150)],
    'city': [random.choice(['New York', 'Los Angeles', 'Chicago', np.nan, '']) for _ in range(150)],
}

weather_df = pd.DataFrame(weather_data)

# Add some duplicate rows
weather_df = pd.concat([weather_df, weather_df.iloc[0:5]], ignore_index=True)

# --- SALES DATASET ---
sales_data = {
    'date': [d.strftime('%Y-%m-%d') if i % 6 != 0 else d.strftime('%B %d, %Y') for i, d in enumerate(random_dates)],
    'sales_amount': [round(np.random.uniform(1000, 10000), 2) if i % 10 != 0 else 'high' for i in range(150)],
    'product_id': [random.choice(['P001', 'P002', 'P003', np.nan, 'p001']) for _ in range(150)],
    'units_sold': [np.random.randint(1, 50) if i % 9 != 0 else None for i in range(150)],
    'store_location': [random.choice(['New York', 'Chicago', 'Houston', '', np.nan]) for _ in range(150)],
}

sales_df = pd.DataFrame(sales_data)

# Add some duplicate rows
sales_df = pd.concat([sales_df, sales_df.iloc[2:4]], ignore_index=True)

# --- SAVE TO CSV FILES ---
weather_df.to_csv("weather.csv", index=False)
sales_df.to_csv("sales.csv", index=False)

print("Generated messy 'weather.csv' and 'sales.csv' with 150+ rows each.")

Generated messy 'weather.csv' and 'sales.csv' with 150+ rows each.


In [435]:
sales_data= pd.read_csv('/content/sales.csv')
weather_data = pd.read_csv('/content/weather.csv')

In [436]:
sales_data

Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,,
1,2023-01-04,6727.63,P003,28.0,
2,2023-02-11,8208.54,P002,36.0,New York
3,2023-02-03,7094.52,,26.0,Houston
4,2023-04-21,6160.3,p001,8.0,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.0,Houston
148,2023-01-31,1960.8,p001,6.0,Houston
149,2023-02-16,4020.94,P003,2.0,
150,2023-02-11,8208.54,P002,36.0,New York


In [437]:
weather_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,hot,,fast,Chicago
1,2023-01-04,24.4,56.0,15.37,
2,2023-02-11,38.8,70.0,0.87,
3,2023-02-03,33.3,63.0,19.89,
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,hot,,fast,Chicago
151,2023-01-04,24.4,56.0,15.37,
152,2023-02-11,38.8,70.0,0.87,
153,2023-02-03,33.3,63.0,19.89,


In [438]:
weather_data.shape

(155, 5)

In [439]:
sales_data.shape

(152, 5)

In [440]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         155 non-null    object 
 1   Temp         155 non-null    object 
 2   humidity(%)  132 non-null    float64
 3   wind_speed   155 non-null    object 
 4   city         92 non-null     object 
dtypes: float64(1), object(4)
memory usage: 6.2+ KB


In [441]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            152 non-null    object 
 1   sales_amount    152 non-null    object 
 2   product_id      128 non-null    object 
 3   units_sold      135 non-null    float64
 4   store_location  78 non-null     object 
dtypes: float64(1), object(4)
memory usage: 6.1+ KB


In [442]:
weather_data.isnull().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),23
wind_speed,0
city,63


In [443]:
sales_data.isnull().sum()

Unnamed: 0,0
date,0
sales_amount,0
product_id,24
units_sold,17
store_location,74


In [444]:
weather_data['humidity(%)'].dtypes

dtype('float64')

In [445]:
mean_col_A = weather_data['humidity(%)'].mean()


In [446]:
weather_data['humidity(%)'].fillna(mean_col_A, inplace=False)
weather_data['humidity(%)']

Unnamed: 0,humidity(%)
0,
1,56.0
2,70.0
3,63.0
4,43.0
...,...
150,
151,56.0
152,70.0
153,63.0


In [447]:
weather_data.isna().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),23
wind_speed,0
city,63


In [448]:
weather_data.isna().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),23
wind_speed,0
city,63


In [449]:
weather_data.columns

Index(['date', 'Temp', 'humidity(%)', 'wind_speed', 'city'], dtype='object')

In [450]:
mean_col_B = weather_data['city'].mode()

In [451]:
weather_data['city'] = weather_data['city'].fillna('chicgo',inplace = False)
weather_data['city']

Unnamed: 0,city
0,Chicago
1,chicgo
2,chicgo
3,chicgo
4,New York
...,...
150,Chicago
151,chicgo
152,chicgo
153,chicgo


In [452]:
weather_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,hot,,fast,Chicago
1,2023-01-04,24.4,56.0,15.37,chicgo
2,2023-02-11,38.8,70.0,0.87,chicgo
3,2023-02-03,33.3,63.0,19.89,chicgo
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,hot,,fast,Chicago
151,2023-01-04,24.4,56.0,15.37,chicgo
152,2023-02-11,38.8,70.0,0.87,chicgo
153,2023-02-03,33.3,63.0,19.89,chicgo


In [453]:
weather_data.isna().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),23
wind_speed,0
city,0


In [454]:
sales_data


Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,,
1,2023-01-04,6727.63,P003,28.0,
2,2023-02-11,8208.54,P002,36.0,New York
3,2023-02-03,7094.52,,26.0,Houston
4,2023-04-21,6160.3,p001,8.0,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.0,Houston
148,2023-01-31,1960.8,p001,6.0,Houston
149,2023-02-16,4020.94,P003,2.0,
150,2023-02-11,8208.54,P002,36.0,New York


In [455]:
sales_data.isna().sum()

Unnamed: 0,0
date,0
sales_amount,0
product_id,24
units_sold,17
store_location,74


In [456]:
sales_data['product_id'] = sales_data['product_id'].fillna(method = 'ffill')
sales_data['product_id']

  sales_data['product_id'] = sales_data['product_id'].fillna(method = 'ffill')


Unnamed: 0,product_id
0,P003
1,P003
2,P002
3,P002
4,p001
...,...
147,p001
148,p001
149,P003
150,P002


In [457]:
sales_data.isna().sum()

Unnamed: 0,0
date,0
sales_amount,0
product_id,0
units_sold,17
store_location,74


In [458]:
mean_col_C = sales_data['units_sold'].mean()
mean_col_C

np.float64(23.985185185185184)

In [459]:
sales_data['units_sold'].fillna(sales_data['units_sold'].mean(), inplace=True)
sales_data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sales_data['units_sold'].fillna(sales_data['units_sold'].mean(), inplace=True)


Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,23.985185,
1,2023-01-04,6727.63,P003,28.000000,
2,2023-02-11,8208.54,P002,36.000000,New York
3,2023-02-03,7094.52,P002,26.000000,Houston
4,2023-04-21,6160.3,p001,8.000000,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.000000,Houston
148,2023-01-31,1960.8,p001,6.000000,Houston
149,2023-02-16,4020.94,P003,2.000000,
150,2023-02-11,8208.54,P002,36.000000,New York


In [460]:
sales_data['store_location']  = sales_data['store_location'].fillna('New York')
sales_data

Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,23.985185,New York
1,2023-01-04,6727.63,P003,28.000000,New York
2,2023-02-11,8208.54,P002,36.000000,New York
3,2023-02-03,7094.52,P002,26.000000,Houston
4,2023-04-21,6160.3,p001,8.000000,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.000000,Houston
148,2023-01-31,1960.8,p001,6.000000,Houston
149,2023-02-16,4020.94,P003,2.000000,New York
150,2023-02-11,8208.54,P002,36.000000,New York


In [461]:
weather_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,hot,,fast,Chicago
1,2023-01-04,24.4,56.0,15.37,chicgo
2,2023-02-11,38.8,70.0,0.87,chicgo
3,2023-02-03,33.3,63.0,19.89,chicgo
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,hot,,fast,Chicago
151,2023-01-04,24.4,56.0,15.37,chicgo
152,2023-02-11,38.8,70.0,0.87,chicgo
153,2023-02-03,33.3,63.0,19.89,chicgo


In [462]:
weather_data['Temp'] = weather_data['Temp'].replace('hot',25.5)
weather_data['Temp']

Unnamed: 0,Temp
0,25.5
1,24.4
2,38.8
3,33.3
4,30.0
...,...
150,25.5
151,24.4
152,38.8
153,33.3


In [463]:
weather_data['wind_speed'] = weather_data['wind_speed'].replace('fast', 10)
weather_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,25.5,,10,Chicago
1,2023-01-04,24.4,56.0,15.37,chicgo
2,2023-02-11,38.8,70.0,0.87,chicgo
3,2023-02-03,33.3,63.0,19.89,chicgo
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,25.5,,10,Chicago
151,2023-01-04,24.4,56.0,15.37,chicgo
152,2023-02-11,38.8,70.0,0.87,chicgo
153,2023-02-03,33.3,63.0,19.89,chicgo


In [464]:
weather_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,25.5,,10,Chicago
1,2023-01-04,24.4,56.0,15.37,chicgo
2,2023-02-11,38.8,70.0,0.87,chicgo
3,2023-02-03,33.3,63.0,19.89,chicgo
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,25.5,,10,Chicago
151,2023-01-04,24.4,56.0,15.37,chicgo
152,2023-02-11,38.8,70.0,0.87,chicgo
153,2023-02-03,33.3,63.0,19.89,chicgo


In [465]:
sales_data

Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,23.985185,New York
1,2023-01-04,6727.63,P003,28.000000,New York
2,2023-02-11,8208.54,P002,36.000000,New York
3,2023-02-03,7094.52,P002,26.000000,Houston
4,2023-04-21,6160.3,p001,8.000000,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.000000,Houston
148,2023-01-31,1960.8,p001,6.000000,Houston
149,2023-02-16,4020.94,P003,2.000000,New York
150,2023-02-11,8208.54,P002,36.000000,New York


In [466]:
sales_data['sales_amount'] = sales_data['sales_amount'].replace('high',1000.0)
sales_data

Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",1000.0,P003,23.985185,New York
1,2023-01-04,6727.63,P003,28.000000,New York
2,2023-02-11,8208.54,P002,36.000000,New York
3,2023-02-03,7094.52,P002,26.000000,Houston
4,2023-04-21,6160.3,p001,8.000000,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.000000,Houston
148,2023-01-31,1960.8,p001,6.000000,Houston
149,2023-02-16,4020.94,P003,2.000000,New York
150,2023-02-11,8208.54,P002,36.000000,New York


In [467]:
grouped_data = weather_data.groupby(['date'])['humidity(%)'].mean()
grouped_data

Unnamed: 0_level_0,humidity(%)
date,Unnamed: 1_level_1
01/02/2023,
01/05/2023,21.0
02/01/2023,55.0
02/02/2023,81.0
02/05/2023,51.0
...,...
2023-05-30,35.5
25/02/2023,99.0
25/04/2023,27.0
26/03/2023,67.0


In [468]:
merge_data = weather_data.merge(sales_data)
merge_data


Unnamed: 0,date,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location
0,2023-01-04,24.4,56.0,15.37,chicgo,6727.63,P003,28.0,New York
1,2023-02-11,38.8,70.0,0.87,chicgo,8208.54,P002,36.0,New York
2,2023-02-11,38.8,70.0,0.87,chicgo,3144.13,P001,22.0,Chicago
3,2023-02-11,38.8,70.0,0.87,chicgo,8208.54,P002,36.0,New York
4,2023-02-03,33.3,63.0,19.89,chicgo,7094.52,P002,26.0,Houston
...,...,...,...,...,...,...,...,...,...
219,2023-02-11,38.8,70.0,0.87,chicgo,8208.54,P002,36.0,New York
220,2023-02-03,33.3,63.0,19.89,chicgo,7094.52,P002,26.0,Houston
221,2023-02-03,33.3,63.0,19.89,chicgo,1350.67,p001,13.0,Chicago
222,2023-02-03,33.3,63.0,19.89,chicgo,7094.52,P002,26.0,Houston


In [469]:
merge_data['sales_amount'] = merge_data['sales_amount'].astype(float).astype(int)
merge_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location
0,2023-01-04,24.4,56.0,15.37,chicgo,6727,P003,28.0,New York
1,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36.0,New York
2,2023-02-11,38.8,70.0,0.87,chicgo,3144,P001,22.0,Chicago
3,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36.0,New York
4,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26.0,Houston
...,...,...,...,...,...,...,...,...,...
219,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36.0,New York
220,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26.0,Houston
221,2023-02-03,33.3,63.0,19.89,chicgo,1350,p001,13.0,Chicago
222,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26.0,Houston


In [470]:
merge_data['units_sold'] = merge_data['units_sold'].astype(float).astype(int)
merge_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location
0,2023-01-04,24.4,56.0,15.37,chicgo,6727,P003,28,New York
1,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York
2,2023-02-11,38.8,70.0,0.87,chicgo,3144,P001,22,Chicago
3,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York
4,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston
...,...,...,...,...,...,...,...,...,...
219,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York
220,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston
221,2023-02-03,33.3,63.0,19.89,chicgo,1350,p001,13,Chicago
222,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston


In [471]:
merge_data['revenue'] = merge_data['sales_amount'] * merge_data['units_sold']
merge_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location,revenue
0,2023-01-04,24.4,56.0,15.37,chicgo,6727,P003,28,New York,188356
1,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
2,2023-02-11,38.8,70.0,0.87,chicgo,3144,P001,22,Chicago,69168
3,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
4,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
...,...,...,...,...,...,...,...,...,...,...
219,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
220,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
221,2023-02-03,33.3,63.0,19.89,chicgo,1350,p001,13,Chicago,17550
222,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444


In [472]:
max_revenue = merge_data.groupby(['product_id'])['revenue'].max()
max_revenue

Unnamed: 0_level_0,revenue
product_id,Unnamed: 1_level_1
P001,443163
P002,352055
P003,331248
p001,310245


In [473]:
#convert to csv

cleaned_data = merge_data.to_csv('cleaned_data')


In [474]:
cleaned_data_json = merge_data.to_json('cleaned_data_json')


In [475]:
rolling_statistics = merge_data[ 'Temp'].rolling(window = 3).mean()
rolling_statistics

Unnamed: 0,Temp
0,
1,
2,34.000000
3,38.800000
4,36.966667
...,...
219,38.800000
220,36.966667
221,35.133333
222,33.300000


In [476]:
merge_data

Unnamed: 0,date,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location,revenue
0,2023-01-04,24.4,56.0,15.37,chicgo,6727,P003,28,New York,188356
1,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
2,2023-02-11,38.8,70.0,0.87,chicgo,3144,P001,22,Chicago,69168
3,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
4,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
...,...,...,...,...,...,...,...,...,...,...
219,2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
220,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
221,2023-02-03,33.3,63.0,19.89,chicgo,1350,p001,13,Chicago,17550
222,2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444


In [477]:
indexed = merge_data.set_index('date')


In [478]:
indexed

Unnamed: 0_level_0,Temp,humidity(%),wind_speed,city,sales_amount,product_id,units_sold,store_location,revenue
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-04,24.4,56.0,15.37,chicgo,6727,P003,28,New York,188356
2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
2023-02-11,38.8,70.0,0.87,chicgo,3144,P001,22,Chicago,69168
2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
...,...,...,...,...,...,...,...,...,...
2023-02-11,38.8,70.0,0.87,chicgo,8208,P002,36,New York,295488
2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444
2023-02-03,33.3,63.0,19.89,chicgo,1350,p001,13,Chicago,17550
2023-02-03,33.3,63.0,19.89,chicgo,7094,P002,26,Houston,184444


In [479]:
merge_data['date'] = pd.to_datetime(merge_data['date'])
merge_data = merge_data.set_index('date')

In [480]:

weekly_revenue = merge_data.resample('W')['revenue'].sum()
weekly_revenue

Unnamed: 0_level_0,revenue
date,Unnamed: 1_level_1
2023-01-01,810306
2023-01-08,1598478
2023-01-15,2173511
2023-01-22,446519
2023-01-29,617033
2023-02-05,2635045
2023-02-12,6115139
2023-02-19,1002889
2023-02-26,874550
2023-03-05,1456945


In [481]:
food_production_data = pd.read_csv('/content/world food production.csv')
food_production_data

Unnamed: 0,Entity,Year,Maize Production (tonnes),Rice Production ( tonnes),Yams Production (tonnes),Wheat Production (tonnes),Tomatoes Production (tonnes),Tea Production ( tonnes ),Sweet potatoes Production (tonnes),Sunflower seed Production (tonnes),...,Oranges Production (tonnes),"Peas, dry Production ( tonnes)",Palm oil Production (tonnes),Grapes Production (tonnes),"Coffee, green Production ( tonnes)",Cocoa beans Production (tonnes),"Meat, chicken Production (tonnes)",Bananas Production ( tonnes),Avocados Production (tonnes),Apples Production (tonnes)
0,Afghanistan,1961,700000.0,319000.00,7.467702e+06,2279000.00,1873812.000,56315.0,3270871.000,12000.000,...,10100.000,232910.000,1131882.00,2.250000e+05,870970.000,835368.000,5600.00,3139079.000,63439.000,15100.000
1,Afghanistan,1962,700000.0,319000.00,7.420515e+06,2279000.00,2044797.000,61519.0,3562524.000,12800.000,...,10100.000,259412.000,1111006.00,2.250000e+05,883512.000,867170.000,6000.00,3181580.000,65118.000,15100.000
2,Afghanistan,1963,713000.0,319000.00,8.479074e+06,1947000.00,2096077.000,63596.0,3409916.000,12800.000,...,10100.000,251529.000,1145004.00,2.250000e+05,996674.000,922621.000,6160.00,3304256.000,61760.000,15100.000
3,Afghanistan,1964,720000.0,380000.00,9.113779e+06,2230000.00,2388264.000,66604.0,3229336.000,12800.000,...,12400.000,247556.000,1160831.00,2.650000e+05,1162048.000,1190061.000,6400.00,3392527.000,62759.000,18400.000
4,Afghanistan,1965,720000.0,380000.00,1.006791e+07,2282000.00,2559608.000,72418.0,3169104.000,13200.000,...,13700.000,266947.000,1138860.00,2.870000e+05,1075084.000,874245.000,6800.00,3450849.000,66269.000,20400.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11907,Zimbabwe,2017,1532572.0,20502618.00,2.673608e+05,30991516.16,324020.864,83904.6,874.408,4077308.066,...,129055.800,40749.600,661978.98,7.018985e+06,63.200,34945.000,75000.00,1267.200,145042.096,1579154.000
11908,Zimbabwe,2018,1560100.0,31871776.60,2.667498e+05,16884482.98,309316.252,66842.6,1383.792,2107639.306,...,168074.000,21606.400,485223.38,3.415107e+06,68.000,34659.800,80357.14,1016.776,143384.384,907419.000
11909,Zimbabwe,2019,773976.0,3382151.40,4.790022e+05,64989733.20,540128.592,9854.4,298831.560,3320915.400,...,1134136.812,45436.400,361128.20,9.645766e+06,30401.400,20961.132,79790.97,195845.198,525095.272,3117813.560
11910,Zimbabwe,2020,1202347.0,3535631.80,7.437612e+05,43020887.20,5635740.200,276279.0,1727311.800,983256.000,...,72179.400,12023.200,279613.40,1.131937e+07,574723.374,158022.122,8000.00,1523258.600,28946.840,7994169.220


In [500]:
index_data1 = pd.read_csv('/content/index.csv')
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte
...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte


In [483]:
food_production_data.shape

(11912, 24)

In [501]:
index_data1.shape

(1133, 6)

In [485]:
food_production_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11912 entries, 0 to 11911
Data columns (total 24 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               11912 non-null  object 
 1   Year                                 11912 non-null  int64  
 2   Maize Production (tonnes)            11912 non-null  float64
 3   Rice  Production ( tonnes)           11912 non-null  float64
 4   Yams  Production (tonnes)            11912 non-null  float64
 5   Wheat Production (tonnes)            11912 non-null  float64
 6   Tomatoes Production (tonnes)         11912 non-null  float64
 7   Tea  Production ( tonnes )           11912 non-null  float64
 8   Sweet potatoes  Production (tonnes)  11912 non-null  float64
 9   Sunflower seed  Production (tonnes)  11912 non-null  float64
 10  Sugar cane Production (tonnes)       11912 non-null  float64
 11  Soybeans  Production (tonnes

In [486]:
food_production_data.isnull().sum()

Unnamed: 0,0
Entity,0
Year,0
Maize Production (tonnes),0
Rice Production ( tonnes),0
Yams Production (tonnes),0
Wheat Production (tonnes),0
Tomatoes Production (tonnes),0
Tea Production ( tonnes ),0
Sweet potatoes Production (tonnes),0
Sunflower seed Production (tonnes),0


In [487]:
food_production_data.dtypes

Unnamed: 0,0
Entity,object
Year,int64
Maize Production (tonnes),float64
Rice Production ( tonnes),float64
Yams Production (tonnes),float64
Wheat Production (tonnes),float64
Tomatoes Production (tonnes),float64
Tea Production ( tonnes ),float64
Sweet potatoes Production (tonnes),float64
Sunflower seed Production (tonnes),float64


In [488]:
food_production_data

Unnamed: 0,Entity,Year,Maize Production (tonnes),Rice Production ( tonnes),Yams Production (tonnes),Wheat Production (tonnes),Tomatoes Production (tonnes),Tea Production ( tonnes ),Sweet potatoes Production (tonnes),Sunflower seed Production (tonnes),...,Oranges Production (tonnes),"Peas, dry Production ( tonnes)",Palm oil Production (tonnes),Grapes Production (tonnes),"Coffee, green Production ( tonnes)",Cocoa beans Production (tonnes),"Meat, chicken Production (tonnes)",Bananas Production ( tonnes),Avocados Production (tonnes),Apples Production (tonnes)
0,Afghanistan,1961,700000.0,319000.00,7.467702e+06,2279000.00,1873812.000,56315.0,3270871.000,12000.000,...,10100.000,232910.000,1131882.00,2.250000e+05,870970.000,835368.000,5600.00,3139079.000,63439.000,15100.000
1,Afghanistan,1962,700000.0,319000.00,7.420515e+06,2279000.00,2044797.000,61519.0,3562524.000,12800.000,...,10100.000,259412.000,1111006.00,2.250000e+05,883512.000,867170.000,6000.00,3181580.000,65118.000,15100.000
2,Afghanistan,1963,713000.0,319000.00,8.479074e+06,1947000.00,2096077.000,63596.0,3409916.000,12800.000,...,10100.000,251529.000,1145004.00,2.250000e+05,996674.000,922621.000,6160.00,3304256.000,61760.000,15100.000
3,Afghanistan,1964,720000.0,380000.00,9.113779e+06,2230000.00,2388264.000,66604.0,3229336.000,12800.000,...,12400.000,247556.000,1160831.00,2.650000e+05,1162048.000,1190061.000,6400.00,3392527.000,62759.000,18400.000
4,Afghanistan,1965,720000.0,380000.00,1.006791e+07,2282000.00,2559608.000,72418.0,3169104.000,13200.000,...,13700.000,266947.000,1138860.00,2.870000e+05,1075084.000,874245.000,6800.00,3450849.000,66269.000,20400.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11907,Zimbabwe,2017,1532572.0,20502618.00,2.673608e+05,30991516.16,324020.864,83904.6,874.408,4077308.066,...,129055.800,40749.600,661978.98,7.018985e+06,63.200,34945.000,75000.00,1267.200,145042.096,1579154.000
11908,Zimbabwe,2018,1560100.0,31871776.60,2.667498e+05,16884482.98,309316.252,66842.6,1383.792,2107639.306,...,168074.000,21606.400,485223.38,3.415107e+06,68.000,34659.800,80357.14,1016.776,143384.384,907419.000
11909,Zimbabwe,2019,773976.0,3382151.40,4.790022e+05,64989733.20,540128.592,9854.4,298831.560,3320915.400,...,1134136.812,45436.400,361128.20,9.645766e+06,30401.400,20961.132,79790.97,195845.198,525095.272,3117813.560
11910,Zimbabwe,2020,1202347.0,3535631.80,7.437612e+05,43020887.20,5635740.200,276279.0,1727311.800,983256.000,...,72179.400,12023.200,279613.40,1.131937e+07,574723.374,158022.122,8000.00,1523258.600,28946.840,7994169.220


In [502]:
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte
...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte


In [530]:
index_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         1133 non-null   object 
 1   datetime     1133 non-null   object 
 2   cash_type    1133 non-null   object 
 3   card         1133 non-null   object 
 4   money        1133 non-null   float64
 5   coffee_name  1133 non-null   object 
 6   card_name    1133 non-null   object 
 7   card_no      1133 non-null   object 
dtypes: float64(1), object(7)
memory usage: 70.9+ KB


In [531]:
index_data1.describe()

Unnamed: 0,money
count,1133.0
mean,33.105808
std,5.035366
min,18.12
25%,28.9
50%,32.82
75%,37.72
max,40.0


In [503]:
index_data1.isnull().sum()

Unnamed: 0,0
date,0
datetime,0
cash_type,0
card,89
money,0
coffee_name,0


In [505]:
index_data1['card'] = index_data1['card'].fillna(method = 'ffill')
index_data1.isnull().sum()

  index_data1['card'] = index_data1['card'].fillna(method = 'ffill')


Unnamed: 0,0
date,0
datetime,0
cash_type,0
card,0
money,0
coffee_name,0


In [513]:
index_data1['card_no'] = index_data1['card'].str.split('-').str[-1]
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_type,card_name,card_no
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,0001,ANON,0001
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON,0002
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON,0002
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,0003,ANON,0003
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,0004,ANON,0004
...,...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,0443,ANON,0443
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,0040,ANON,0040
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,0444,ANON,0444
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,0445,ANON,0445


In [508]:
 index_data1.dtypes

Unnamed: 0,0
date,object
datetime,object
cash_type,object
card,object
money,float64
coffee_name,object
card_type,object


In [509]:
index_data1['card_name'] = index_data1['card'].str.split('-').str[0]
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_type,card_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,0001,ANON
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,0003,ANON
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,0004,ANON
...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,0443,ANON
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,0040,ANON
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,0444,ANON
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,0445,ANON


In [514]:
index_data1  = index_data1.replace({'card_type','card_no'})
index_data1

  index_data1  = index_data1.replace({'card_type','card_no'})


Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_type,card_name,card_no
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,0001,ANON,0001
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON,0002
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,0002,ANON,0002
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,0003,ANON,0003
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,0004,ANON,0004
...,...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,0443,ANON,0443
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,0040,ANON,0040
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,0444,ANON,0444
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,0445,ANON,0445


In [521]:
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_name,card_no
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,ANON,0001
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,ANON,0003
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,ANON,0004
...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,ANON,0443
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,ANON,0040
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,ANON,0444
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,ANON,0445


In [523]:
money_type = index_data1.groupby(['cash_type'])['money'].max()
money_type

Unnamed: 0_level_0,money
cash_type,Unnamed: 1_level_1
card,38.7
cash,40.0


In [542]:
max_coffee_purchased = index_data1['coffee_name'].value_counts()
max_coffee_purchased

Unnamed: 0_level_0,count
coffee_name,Unnamed: 1_level_1
Americano with Milk,268
Latte,243
Cappuccino,196
Americano,169
Cortado,99
Hot Chocolate,74
Espresso,49
Cocoa,35


In [552]:
# How much coffee was purchased on what date

coffee = index_data1.groupby(['coffee_name'])['date'].max()
coffee

Unnamed: 0_level_0,date
coffee_name,Unnamed: 1_level_1
Americano,2024-07-31
Americano with Milk,2024-07-31
Cappuccino,2024-07-31
Cocoa,2024-07-31
Cortado,2024-07-31
Espresso,2024-07-30
Hot Chocolate,2024-07-29
Latte,2024-07-31


In [553]:
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_name,card_no
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,ANON,0001
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,ANON,0003
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,ANON,0004
...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,ANON,0443
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,ANON,0040
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,ANON,0444
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,ANON,0445


In [555]:
index_data1['date'] = pd.to_datetime(index_data1['date'])
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_name,card_no
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,ANON,0001
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,ANON,0003
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,ANON,0004
...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,ANON,0443
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,ANON,0040
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,ANON,0444
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,ANON,0445


In [557]:
index_data1['month_name'] = index_data1['date'].dt.month_name()
index_data1

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name,card_name,card_no,month_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.70,Latte,ANON,0001,March
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002,March
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.70,Hot Chocolate,ANON,0002,March
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.90,Americano,ANON,0003,March
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.70,Latte,ANON,0004,March
...,...,...,...,...,...,...,...,...,...
1128,2024-07-31,2024-07-31 20:53:35.077,card,ANON-0000-0000-0443,23.02,Cortado,ANON,0443,July
1129,2024-07-31,2024-07-31 20:59:25.013,card,ANON-0000-0000-0040,27.92,Americano with Milk,ANON,0040,July
1130,2024-07-31,2024-07-31 21:26:26.000,card,ANON-0000-0000-0444,32.82,Latte,ANON,0444,July
1131,2024-07-31,2024-07-31 21:54:11.824,card,ANON-0000-0000-0445,32.82,Latte,ANON,0445,July


In [562]:
max_coffee_sell = index_data1.groupby(['month_name'])['coffee_name'].value_counts().reset_index()
max_coffee_sell

Unnamed: 0,month_name,coffee_name,count
0,April,Cappuccino,43
1,April,Americano with Milk,42
2,April,Americano,35
3,April,Latte,31
4,April,Cortado,19
5,April,Hot Chocolate,13
6,April,Espresso,7
7,April,Cocoa,6
8,July,Americano with Milk,65
9,July,Latte,56


In [563]:
food_production_data

Unnamed: 0,Entity,Year,Maize Production (tonnes),Rice Production ( tonnes),Yams Production (tonnes),Wheat Production (tonnes),Tomatoes Production (tonnes),Tea Production ( tonnes ),Sweet potatoes Production (tonnes),Sunflower seed Production (tonnes),...,Oranges Production (tonnes),"Peas, dry Production ( tonnes)",Palm oil Production (tonnes),Grapes Production (tonnes),"Coffee, green Production ( tonnes)",Cocoa beans Production (tonnes),"Meat, chicken Production (tonnes)",Bananas Production ( tonnes),Avocados Production (tonnes),Apples Production (tonnes)
0,Afghanistan,1961,700000.0,319000.00,7.467702e+06,2279000.00,1873812.000,56315.0,3270871.000,12000.000,...,10100.000,232910.000,1131882.00,2.250000e+05,870970.000,835368.000,5600.00,3139079.000,63439.000,15100.000
1,Afghanistan,1962,700000.0,319000.00,7.420515e+06,2279000.00,2044797.000,61519.0,3562524.000,12800.000,...,10100.000,259412.000,1111006.00,2.250000e+05,883512.000,867170.000,6000.00,3181580.000,65118.000,15100.000
2,Afghanistan,1963,713000.0,319000.00,8.479074e+06,1947000.00,2096077.000,63596.0,3409916.000,12800.000,...,10100.000,251529.000,1145004.00,2.250000e+05,996674.000,922621.000,6160.00,3304256.000,61760.000,15100.000
3,Afghanistan,1964,720000.0,380000.00,9.113779e+06,2230000.00,2388264.000,66604.0,3229336.000,12800.000,...,12400.000,247556.000,1160831.00,2.650000e+05,1162048.000,1190061.000,6400.00,3392527.000,62759.000,18400.000
4,Afghanistan,1965,720000.0,380000.00,1.006791e+07,2282000.00,2559608.000,72418.0,3169104.000,13200.000,...,13700.000,266947.000,1138860.00,2.870000e+05,1075084.000,874245.000,6800.00,3450849.000,66269.000,20400.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11907,Zimbabwe,2017,1532572.0,20502618.00,2.673608e+05,30991516.16,324020.864,83904.6,874.408,4077308.066,...,129055.800,40749.600,661978.98,7.018985e+06,63.200,34945.000,75000.00,1267.200,145042.096,1579154.000
11908,Zimbabwe,2018,1560100.0,31871776.60,2.667498e+05,16884482.98,309316.252,66842.6,1383.792,2107639.306,...,168074.000,21606.400,485223.38,3.415107e+06,68.000,34659.800,80357.14,1016.776,143384.384,907419.000
11909,Zimbabwe,2019,773976.0,3382151.40,4.790022e+05,64989733.20,540128.592,9854.4,298831.560,3320915.400,...,1134136.812,45436.400,361128.20,9.645766e+06,30401.400,20961.132,79790.97,195845.198,525095.272,3117813.560
11910,Zimbabwe,2020,1202347.0,3535631.80,7.437612e+05,43020887.20,5635740.200,276279.0,1727311.800,983256.000,...,72179.400,12023.200,279613.40,1.131937e+07,574723.374,158022.122,8000.00,1523258.600,28946.840,7994169.220


In [566]:
food_production_data.columns

Index(['Entity', 'Year', 'Maize Production (tonnes)',
       'Rice  Production ( tonnes)', 'Yams  Production (tonnes)',
       'Wheat Production (tonnes)', 'Tomatoes Production (tonnes)',
       'Tea  Production ( tonnes )', 'Sweet potatoes  Production (tonnes)',
       'Sunflower seed  Production (tonnes)', 'Sugar cane Production (tonnes)',
       'Soybeans  Production (tonnes)', 'Rye  Production (tonnes)',
       'Potatoes  Production (tonnes)', 'Oranges  Production (tonnes)',
       'Peas, dry Production ( tonnes)', 'Palm oil  Production (tonnes)',
       'Grapes  Production (tonnes)', 'Coffee, green Production ( tonnes)',
       'Cocoa beans Production (tonnes)', 'Meat, chicken  Production (tonnes)',
       'Bananas  Production ( tonnes)', 'Avocados Production (tonnes)',
       'Apples Production (tonnes)'],
      dtype='object')

In [578]:
city_max_maize = food_production_data.groupby(['Entity'])['Maize Production (tonnes)'].value_counts().reset_index()
city_max_maize

Unnamed: 0,Entity,Maize Production (tonnes),count
0,Afghanistan,720000.0,4
1,Afghanistan,300000.0,3
2,Afghanistan,360000.0,3
3,Afghanistan,760000.0,3
4,Afghanistan,316000.0,2
...,...,...,...
11130,Zimbabwe,2341209.0,1
11131,Zimbabwe,2357152.0,1
11132,Zimbabwe,2545600.0,1
11133,Zimbabwe,2828000.0,1
