In [160]:
import pandas as pd
import numpy as np

In [161]:
ufo_sightings_large = pd.read_csv('ufo_sightings_large.csv')
ufo_sightings_large.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


### Checking column types

In [162]:
ufo_sightings_large.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [163]:
ufo_sightings_large['date'] = pd.to_datetime(ufo_sightings_large['date'])
ufo_sightings_large.dtypes

date              datetime64[ns]
city                      object
state                     object
country                   object
type                      object
seconds                  float64
length_of_time            object
desc                      object
recorded                  object
lat                       object
long                     float64
dtype: object

### checking_missing_values

In [164]:
print((ufo_sightings_large.isnull().sum()/ufo_sightings_large.shape[0])*100)

date               0.000000
city               0.182371
state              8.490375
country           13.779129
type               3.221884
seconds            0.000000
length_of_time     2.897670
desc               0.060790
recorded           0.000000
lat                0.000000
long               0.000000
dtype: float64


In [165]:
ufo_sightings_large.shape

(4935, 11)

### Removing Missing Values

In [166]:
#ufo_no_missing = ufo_sightings_large[ufo_sightings_large["length_of_time"].notnull() & ufo_sightings_large["state"].notnull() & 
          #ufo_sightings_large["type"].notnull()]
#print(ufo_no_missing.shape)

### Categorical Variables and standardization

In [167]:
ufo_sightings_large["minutes"] = ufo_sightings_large["length_of_time"].str.extract("(\d+)", expand=True)
ufo_sightings_large[['length_of_time','minutes']].head()

Unnamed: 0,length_of_time,minutes
0,2 weeks,2.0
1,30sec.,30.0
2,,
3,about 5 minutes,5.0
4,2,2.0


### Identifying features for standardization

In [168]:
ufo_sightings_large = ufo_sightings_large[ufo_sightings_large['minutes'].notnull() & ufo_sightings_large["length_of_time"].notnull() & ufo_sightings_large["state"].notnull() & 
          ufo_sightings_large["type"].notnull()]
ufo_sightings_large['minutes'] = ufo_sightings_large['minutes'].astype('int')

In [169]:
ufo_sightings_large.var()

seconds    3.155433e+09
long       4.125837e+02
minutes    9.173689e+02
dtype: float64

In [170]:
print(ufo_sightings_large[['seconds','minutes']].var())
ufo_sightings_large["seconds_log"] = np.log(ufo_sightings_large['seconds'])
print(ufo_sightings_large["seconds_log"].var())

seconds    3.155433e+09
minutes    9.173689e+02
dtype: float64
nan


  


In [171]:
ufo_sightings_large.dropna()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long,minutes,seconds_log
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111,2,14.005800
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556,30,3.401197
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222,5,5.703782
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333,2,-inf
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,-117.156389,10,6.396930
6,2009-07-12 21:30:00,duluth,mn,us,oval,600.0,total? maybe around 10 mi,A minor amber color trail&#44 (from where we w...,3/13/2012,46.7833333,-92.106389,10,6.396930
7,2008-10-20 18:30:00,fairfield,tx,us,other,0.0,several sightings from 10,Multiple sightings in Central Texas (Freestone...,1/10/2009,31.7244444,-96.165000,10,-inf
8,2013-06-09 00:00:00,oakville (canada),on,ca,light,120.0,2 minutes,Brilliant orange light or chinese lantern at o...,7/3/2013,43.433333,-79.666667,2,4.787492
9,2013-04-26 23:27:00,lacey,wa,us,light,120.0,2 minutes,Bright red light moving north to north west fr...,5/15/2013,47.0344444,-122.821944,2,4.787492
10,2013-09-13 20:30:00,ben avon,pa,us,sphere,300.0,5 minutes,North-east moving south-west. First 7 or so li...,9/30/2013,40.5080556,-80.083333,5,5.703782


In [172]:
ufo_sightings_large.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long,minutes,seconds_log
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111,2,14.0058
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556,30,3.401197
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222,5,5.703782
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333,2,-inf
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,-117.156389,10,6.39693


### Feature_Engineering

### Encoding categorical variables

In [173]:
ufo_sightings_large['country'].unique()

array(['us', 'ca', nan, 'au'], dtype=object)

In [174]:
ufo_sightings_large["country_enc"] = ufo_sightings_large["country"].apply(lambda val: 1 if val == "us" else 0)

In [175]:
type_set = pd.get_dummies(ufo_sightings_large["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo_sightings_large, type_set], axis=1)

### Features from dates

In [176]:
print(ufo['date'].head())
ufo["month"] = ufo["date"].apply(lambda row: row.month)
ufo["year"] = ufo["date"].apply(lambda row: row.year)
print(ufo[['date', 'month','year']].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010
5 2012-06-16 23:00:00      6  2012


### Text vectorization

In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo['desc'])

# Look at the number of columns this creates
print(desc_tfidf.shape)

(3980, 5472)


In [178]:
ufo.columns

Index(['date', 'city', 'state', 'country', 'type', 'seconds', 'length_of_time',
       'desc', 'recorded', 'lat', 'long', 'minutes', 'seconds_log',
       'country_enc', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')

### Feature Selection

### Selecting the ideal dataset

In [179]:
print(ufo[['seconds','seconds_log','minutes']].corr())

              seconds  seconds_log   minutes
seconds      1.000000     0.165801 -0.009954
seconds_log  0.165801     1.000000  0.109269
minutes     -0.009954     0.109269  1.000000


In [183]:
# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time","type", "long", "minutes","seconds","recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
#filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

### Modelling

    - Modeling the UFO dataset, part 1
    - we're going to build a k-nearest neighbor model to predict which country the UFO sighting took place in. 

In [184]:
X = ufo_dropped.drop(['country_enc'],axis = 1)
# Take a look at the features in the X set of data
print(X.columns)
y = ufo_dropped[['country_enc']]

Index(['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')


In [None]:
# Split the X and y sets using train_test_split, setting stratify=y
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,y,stratify = y)

# Fit knn to the training sets
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(train_X,train_y)

# Print the score of knn on the test sets
print(rf.score(test_X,test_y))