In [2]:
# Import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from flask import Flask, jsonify, request

# Load data
data = pd.read_csv('./data-small-more-patterns.csv')

# Separate features and target variable
X = data.drop(['Target-MCC1'], axis=1)
y = data['Target-MCC1']

# Label encode categorical features
cat_features = ['AgeGroup', 'MaritalStatus', 'Day', 'Gender', 'City']
for feature in cat_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Initialize Flask app
app = Flask(__name__)

# Define API endpoint for making predictions
@app.route('/predict', methods=['POST'])
def predict():
    # Get data from request
    data = request.get_json()
    data = pd.DataFrame.from_dict(data)
    
    # Label encode categorical features
    for feature in cat_features:
        le = LabelEncoder()
        data[feature] = le.fit_transform(data[feature])
    
    # Make prediction using trained model
    pred = model.predict(data)
    
    # Return prediction as JSON
    return jsonify({'prediction': list(pred)})

# Start Flask app
if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('./Data_2000_SA.csv')

# Separate features and target variable
X = data.drop(['Target-MCC1'], axis=1)
y = data['Target-MCC1']

# Label encode categorical features
cat_features = ['AgeGroup', 'MaritalStatus', 'Day', 'Gender', 'City']
le_dict = {}
for feature in cat_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])
    le_dict[feature] = le


# Load data and split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train random forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Map feature names to importances
feature_names = ['AgeGroup', 'MaritalStatus', 'Day', 'Gender', 'City', 
                 'Avg-7832', 'Avg-5411', 'Avg-5812', 'Avg-7032', 'Avg-5983',
                 'Avg-4111', 'Avg-5999', 'Avg-5691', 'Avg-4814', 'Avg-5621']
importances_dict = dict(zip(feature_names, importances))

# Print importance scores for each feature
for feature, importance in importances_dict.items():
    print(f"{feature}: {importance}")


AgeGroup: 0.007335352846323242
MaritalStatus: 0.003780795258948365
Day: 0.006009442644614128
Gender: 0.004859433183447916
City: 0.006545369156478366
Avg-7832: 0.46691761668599463
Avg-5411: 0.03854568023937666
Avg-5812: 0.01321235913167839
Avg-7032: 0.2247785109812169
Avg-5983: 0.01376515795764316
Avg-4111: 0.10336592425084284
Avg-5999: 0.03103362660596684
Avg-5691: 0.016577535325028935
Avg-4814: 0.013617798502985055
Avg-5621: 0.04965539722945451


In [None]:
# # It is difficult to say whether a feature importance value is "bad" or not without more context. 
# However, the values you have provided suggest that "Avg-7832" is the most important feature for predicting the
# target variable in the model, followed by "Avg-7032" and "Avg-4111", while "MaritalStatus" and "Gender" 
# have relatively low importance.

# # It is important to note that feature importance values should be interpreted relative to each other
# #  and the specific context of the problem being solved. A low feature importance value does not necessarily
# #  mean that the feature is not useful for prediction, but rather that it may have less predictive power than 
# # other features in the model. Conversely, a high feature importance value does not necessarily mean that a
# #  feature is the most important for prediction in all contexts.

In [3]:
import requests


data = {
    'AgeGroup': ['30'],
    'MaritalStatus': ['Single'],
    'Day': ['Weekday'],
    'Gender': ['M'],
    'City': ['ISL'],
    'Avg-7832': ['0'],
    'Avg-5411': ['10'],
    'Avg-5812': ['0'],
    'Avg-7032': ['70'],
    'Avg-5983': ['20'],
    'Avg-4111': ['0'],
    'Avg-5999': ['0'],
    'Avg-5691': ['0'],
    'Avg-4814':['0'],
    'Avg-5621': ['0']
}




# Send POST request to Flask app
response = requests.post('http://localhost:5000/predict', json=data)

# Print prediction from response
print(response.json())


{'predictions': [{'class': '7032', 'probability': '0.88'}, {'class': '5411', 'probability': '0.05'}, {'class': '7832', 'probability': '0.04'}]}
