In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from collections import Counter

In [2]:
def find_transitions(df):
    
    flight_pairs = list(zip(df['ORG_CITY_NM'], df['CITY_NM']))
    pair_counts = Counter(flight_pairs)
    # all_cities = set(list(df['ORG_CITY_NM'].unique()) + list(df['CITY_NM'].unique()))
    shape = (len(all_cities_list), len(all_cities_list))
    
    transition_df = pd.DataFrame(np.zeros(shape),index=sorted(all_cities_list), columns=sorted(all_cities_list))
    
    for k, v in pair_counts.items():
        transition_df.loc[k[0], k[1]] = v
    
    return transition_df.div(transition_df.sum(axis=1)+0.0000001, axis=0)

In [3]:
def find_customer_transitions(cust_df):
    return find_transitions(cust_df)

In [4]:
def find_seasonal_transitions(df, season):
    seasonal_df = df[df['season']==season]
    return find_transitions(seasonal_df)

In [5]:
dtypes = {'CUST_KEY': 'int32',
          'ORG_CITY_NM': 'category',
          'CITY_NM': 'category',
          'DOM_INTNL_FLAG': 'category',
          'JRNY_TYP': 'category',
          'TOP1_ORG_MKT_REGN': 'category',
          'TOP1_ORG_CTY': 'category'}
dates = ['SEG_LCL_DEP_DT']

In [6]:
replace_for_errors= {'ABÃÂ°DJAN':'ABIDJAN',
                     'AB�DJAN':'ABIDJAN',
                     '─NGELHOLM': 'ANGELHOLM',
                    'ADDÃÂ°S ABABA' : 'ADDIS ABABA',
                     'ADD�S ABABA' : 'ADDIS ABABA',
                     'ASUNCIÃÂ³N': 'ASUNCION',
                     'CHAPEC≤': 'CHAPECO',
                     'CÃÂºCUTA': 'CUCUTA',
                     'HO CHÃÂ° MÃÂ°NH CÃÂ°TY (SAÃÂ°GO' : 'HO CHI MINH CITY (SAIGON)',
                     'HO CH� M�NH C�TY (SA�GON)' : 'HO CHI MINH CITY (SAIGON)',
                     'HAGΣT±A, GUAM': 'HAGATNA, GUAM',
                     'ILHΘUS': 'ILHEUS',
                     'LEFKOÃÅ¾A' : 'LEFKOSA',
                     'LEFKO�A':'LEFKOSA',
                     'LÃÂ°LONGWE': 'LILONGWE',
                     'L�LONGWE': 'LILONGWE',
                     'LINK÷PING': 'LINKOPING',
                    'KÃÂ°LÃÂ°MANJARO' : 'KILIMANJARO',
                     'K�L�MANJARO': 'KILIMANJARO',
                    'MOGADÃÂ°SHU' : 'MOGADISHU',
                     'MOGAD�SHU' : 'MOGADISHU',
                     'JOÃÂ£O PESSOA': 'JOAO PESSOA',
                     'MALÃÂ°': 'MALI',
                     'MÃÂ°SURATA': 'MISURATA',
                     'M�SURATA': 'MISURATA',
                     'MAZAR-E SHARÃÂ°F': 'MAZAR-E SHARIF',
                     'MAZAR-E SHAR�F': 'MAZAR-E SHARIF',
                     'NOVOSÃÂ°BÃÂ°RSK': 'NOVOSIBIRSK',
                     'NOVOS�B�RSK': 'NOVOSIBIRSK',
                     'OVÃÂ°EDO': 'OVIEDO',
                     'OV�EDO': 'OVIEDO',
                     'OKÃÂ°NAWA': 'OKINAWA',
                     'OK�NAWA': 'OKINAWA',
                     'ÃÂ�STERSUND': 'OSTERSUND',
                     'REYKJAVÃÂ­K': 'REYKJAVIK',
                    'SÃÂ°NOP' : 'SINOP',
                     'S�NOP': 'SINOP',
                    'SULEYMANÃÂ°AH' : 'SULEYMANIAH',
                     'SULEYMAN�AH': 'SULEYMANIAH',
                    'SHÃÂ°RAZ' : 'SHIRAZ',
                     'SH�RAZ': 'SHIRAZ',
                    'SHARM EL-SHEÃÂ°KH' : 'SHARM EL-SHEIKH',
                     'SHARM EL-SHE�KH': 'SHARM EL-SHEIKH',
                     'SÃÂ£O LUÃÂ­S': 'SAO LUIS',
                     'SEVÃÂ°LLE': 'SEVILLE',
                     'SEV�LLE': 'SEVILLE',
                     'SPLÃÂ°T':'SPLIT',
                     'SPL�T':'SPLIT',
                     'SÃÂ°ÃÂ°RT' : 'SIIRT',
                     'S��RT':'SIIRT',
                     'SΠO JOSΘ DO RIO PRETO': 'SAO JOSE DO RIO PRETO',
                     'SAN JOSÃÂ© DEL CABO': 'SAN JOSE DEL CABO',
                     'SAN PEDRO SULA\t': 'SAN PEDRO SULA',                     
                     'SAN ANDRΘS': 'SAN ANDROS',
                    'TURÃÂ°N(TORÃÂ°NO)' : 'TURIN(TORINO)',
                     'TUR�N(TOR�NO)': 'TURIN(TORINO)',
                    'TEKRDAG' : 'TEKIRDAG',
                     'TEK�RDAG': 'TEKIRDAG',
                     'TAÃÂ°F': 'TAIF',
                     'TA�F': 'TAIF',
                     'TAPA': 'TAIPA',
                     'UBERLÃÂ¢NDIA':'UBERLANDIA',
                    'THESSALONÃÂ°KÃÂ°' : 'THESSALONIKI',
                     'THESSALON�K�': 'THESSALONIKI',
                    'VALENCÃÂ°A' : 'VALENCIA',
                     'VALENC�A': 'VALENCIA',
                    'VITÃÂ³RIA': 'VITORIA'}

replace_for_errors = {k.lower(): v.lower() for k,v in replace_for_errors.items()}

In [7]:
seasons = {'winter':[12,1,2], 
           'spring':[3,4,5], 
           'summer':[6,7,8],
           'fall':[9,10,11]}

season_mapping = {month:k for k,v in seasons.items() for month in v}
season_mapping

{12: 'winter',
 1: 'winter',
 2: 'winter',
 3: 'spring',
 4: 'spring',
 5: 'spring',
 6: 'summer',
 7: 'summer',
 8: 'summer',
 9: 'fall',
 10: 'fall',
 11: 'fall'}

In [8]:
path = r'BIR_SONRAKI_DEST_VALSET.csv'
cols = ['CUST_KEY', 'SEG_LCL_DEP_DT', 'CITY_NM', 'ORG_CITY_NM', 'TOP1_ORG_CTY']
val = pd.read_csv(path, sep='|', parse_dates=dates, usecols=cols)

In [9]:
for col in ['ORG_CITY_NM', 'CITY_NM', 'TOP1_ORG_CTY']:
    val[col] = val[col].str.lower()

In [10]:
val.replace(replace_for_errors, inplace=True)

In [11]:
val['season'] = val['SEG_LCL_DEP_DT'].apply(lambda x: season_mapping[x.month]).astype('category')

In [12]:
val.dropna(inplace=True)

In [13]:
val.head()

Unnamed: 0,CUST_KEY,SEG_LCL_DEP_DT,CITY_NM,ORG_CITY_NM,TOP1_ORG_CTY,season
0,5238184,2021-06-19,ankara,blantyre,ankara,summer
1,5435557,2021-05-22,istanbul,ankara,istanbul,spring
2,5528575,2021-06-30,adana,istanbul,adana,summer
3,5609847,2021-03-02,istanbul,malatya,istanbul,spring
4,5719611,2021-03-19,ankara,suleymaniah,ankara,spring


In [14]:
alt_cities_path = r'AlternatifSehirler_BirSonrakiDestinasyonTahminleme.xlsx'

replace_for_alter_cities = {'ordu_giresun':'ordu-giresun',
                            'dar_es_salaam':'dar es salaam',
                            'dalaman_mugla':'dalaman (mugla)',
                            'tel_aviv':'tel aviv',
                            'sharm_el_sheikh':'sharm el-sheikh',
                            'abu_dhabi':'abu dhabi',
                           'basel_mulhouse':'basel/mulhouse',
                           'amasya___merzifon':'amasya/merzifon',
                           'balikesir_edremit':'balikesir edremit'}

alter_cities = pd.read_excel(alt_cities_path, usecols=['City_1', 'City_2'])

for col in alter_cities.columns:
    alter_cities[col] = alter_cities[col].str.lower()
alter_cities.replace(replace_for_alter_cities, inplace=True)

In [None]:
import pickle
with open("general_transitions.p","rb") as f:
    general_transitions = pickle.load(f)
with open("seasonal_transitions.p","rb") as f:
    seasonal_transitions = pickle.load(f)

In [None]:
with open("train_df.p","rb") as f:
    df = pickle.load(f)

In [17]:
# finding all unique cities
all_cities_list = list(general_transitions.index)

In [18]:
def add_dataframes(general_df, seasonal_df, customer_df):
    
    tmp_df = pd.DataFrame(np.zeros(general_df.shape), 
                          columns=general_df.columns, index=general_df.index)
    seasonal_df = seasonal_df.add(tmp_df, fill_value=0.0)
    customer_df = customer_df.add(tmp_df, fill_value=0.0)
    
    return general_df + seasonal_df + customer_df

In [19]:
def predict(row, pred_dep='origin', season=None, n_pred=7, pred_type='only_cities', filter_alternate=True):
    """
    Parameters:
    -----------
    row: Series
        Data point to be predicted
        
    pred_dep: {'origin', 'top_city'}, default 'origin'
        Which city will be used for predicting the next flight. 
        'origin' refers to using test flight's origin city.
        'top_city' refers to using 'TOP1_ORG_CTY' of the customer.
        
    season: {None, 'winter', 'summer', 'spring', 'fall'}, default None
        Season param for using seasonal transition matrix. 
        None refers to using test flight's season
        
    n_pred: int, default 7
        Number of cities that model suggest to the customer.
        
    pred_type: {'only_cities', 'with_probs'}, default 'only_cities'
        'only_cities': returns only the top predicted cities
        'with_probs': returns predictions with probabilities
    
    filter_alternate: {True, False}, default True
        filters alternative cities from suggestions
    Returns:
    --------
    final_preds: dict or list
        if pred_type param is 'only_cities' it returns only predicted cities (list)
        if pred_type param is 'with_probs' it returns predicted cities with probabilities (dict)
        
    """
    
    cust_key = row['CUST_KEY']
    cust_df = df[df['CUST_KEY']==cust_key]

    # customer transitions
    cust_transitions = find_customer_transitions(cust_df)

    # seasonal customer transitions
    if season==None:
        season = row['season']
    if not season in seasons.keys():
        print('Wrong season')
        return False
    cust_seasonal_transitions = find_seasonal_transitions(cust_df, season)

    # weights
    w1 = 1
    w2 = len(cust_df)
    w3 = 2*w2
    total_sum = (w1+w2+w3)
    w1 = w1/total_sum
    w2 = w2/total_sum
    w3 = w3/total_sum
    

    if pred_dep == 'origin':
        departure = row['ORG_CITY_NM']
    elif pred_dep == 'top_city':
        departure = row['TOP1_ORG_CTY']
    else:
        print('Season param is wrong!')
        return False   
    
    # final_transitions = add_dataframes(w1*general_transitions, w2*seasonal_transitions[season], 
    #                                   w3*cust_seasonal_transitions)
    final_transitions = w1*general_transitions.values+w2*seasonal_transitions[season].values+w3*cust_seasonal_transitions.values
    final_transitions = pd.DataFrame(final_transitions, index=general_transitions.index, columns=general_transitions.index) 
    # final_transitions.loc[departure].sort_values(ascending=False)[0:n_pred]
    
    preds = final_transitions.loc[departure].sort_values(ascending=False)
    filtered_preds = []
    alternative_cities = []
    for city, prob in preds.items():
        
        if not city in alternative_cities:
            filtered_preds.append((city, prob))
            alternative_cities.extend(alter_cities.loc[alter_cities['City_1']==city, 'City_2'].values)
            
        if len(filtered_preds)>=n_pred:
            break
            
    final_preds = dict(filtered_preds)
    
    if pred_type == 'only_cities':
        return list(final_preds.keys())
    elif pred_type == 'with_probs':
        return final_preds
    else:
        print('Wrong return type!')
        return False

In [20]:
def calculate_accuracy(y_test, y_pred):
    trues = 0
    for true, pred in zip(y_test, y_pred):
        if true in pred:
            trues += 1
    return trues/len(y_test)

In [22]:
import random
sample_indexes = random.sample(list(val.index), 100_000)

In [26]:
sample_size = 10_000
final_results = {}
final_accs = {}

for i in range(0, 100_000, sample_size):
    
    data = val.loc[sample_indexes[i: i+sample_size]]
    results = data.apply(predict, axis=1, pred_type='only_cities', pred_dep='origin')
    acc = calculate_accuracy(data['CITY_NM'], results)
    final_results[i] = results
    final_accs[i] = acc
    print(i, ': ', acc)

0 :  0.7588010460671897
10000 :  0.76
20000 :  0.7533514766656587
30000 :  0.7529222087867795
40000 :  0.7434243676307568
50000 :  0.7487158827676503
60000 :  0.7553855445943225
70000 :  0.6653857821822227
80000 :  0.6883077544426495
90000 :  0.7538027601490883


In [30]:
means = np.array(list(final_accs.values()))
means.mean(), means.std()

(0.7380096823286318, 0.031326354600529294)

In [42]:
data['results'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['results'] = results


In [46]:
calculate_accuracy(data['CITY_NM'],data['results'])

0.7538027601490883